From 832d1b1c812acff0a2e01799a0a59d69ebcc7d25 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko
Date: Thu, 24 Jul 2025 11:16:00 -0700
Subject: [PATCH 1/3] [𝘀𝗽𝗿] changes to main this commit is based on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.4

[skip ci]
---
 .ci/compute_projects.py | 19 +
 .ci/compute_projects_test.py | 34 +-
 .ci/monolithic-linux.sh | 14 +-
 .ci/monolithic-windows.sh | 5 -
 .github/CODEOWNERS | 6 +-
 .github/new-prs-labeler.yml | 5 +
 .../containers/github-action-ci/Dockerfile | 14 +-
 .github/workflows/premerge.yaml | 5 +-
 bolt/include/bolt/Core/BinaryContext.h | 17 +-
 bolt/include/bolt/Core/BinaryFunction.h | 8 +
 bolt/include/bolt/Passes/LongJmp.h | 33 +
 bolt/lib/Core/BinaryContext.cpp | 52 +-
 bolt/lib/Core/BinaryEmitter.cpp | 6 +-
 bolt/lib/Core/BinaryFunction.cpp | 9 +
 bolt/lib/Passes/Aligner.cpp | 5 +
 bolt/lib/Passes/BinaryPasses.cpp | 2 +
 bolt/lib/Passes/LongJmp.cpp | 265 +-
 bolt/lib/Passes/PatchEntries.cpp | 16 +-
 bolt/lib/Passes/SplitFunctions.cpp | 6 +-
 bolt/lib/Profile/DataAggregator.cpp | 9 +-
 bolt/lib/Rewrite/BinaryPassManager.cpp | 6 +-
 bolt/lib/Rewrite/RewriteInstance.cpp | 14 +-
 bolt/test/AArch64/veneer-bolt.s | 66 +
 bolt/test/X86/cdsplit-call-scale.s | 2 +-
 clang-tools-extra/README.txt | 9 +-
 clang-tools-extra/clang-doc/JSONGenerator.cpp | 20 +-
 .../bugprone/MoveForwardingReferenceCheck.cpp | 2 +-
 .../clang-tidy/misc/UnusedAliasDeclsCheck.cpp | 3 +-
 .../utils/RenamerClangTidyCheck.cpp | 3 +-
 clang-tools-extra/clangd/AST.cpp | 10 +-
 clang-tools-extra/clangd/CodeComplete.cpp | 1 -
 clang-tools-extra/clangd/DumpAST.cpp | 3 -
 clang-tools-extra/clangd/FindTarget.cpp | 3 -
 clang-tools-extra/clangd/IncludeFixer.cpp | 36 +-
 clang-tools-extra/clangd/ModulesBuilder.cpp | 3 +-
 .../clangd/refactor/tweaks/AddUsing.cpp | 8 +-
 clang-tools-extra/docs/ReleaseNotes.rst | 290 +-
 .../docs/clang-tidy/Contributing.rst | 23 +
 .../include-cleaner/lib/WalkAST.cpp | 1 -
 .../test/clang-query/trailing-comma.c | 21 +
 clang/docs/CXXTypeAwareAllocators.rst | 155 +
 clang/docs/ClangFormatStyleOptions.rst | 10 +
 clang/docs/LanguageExtensions.rst | 8 +
 clang/docs/LibClang.rst | 6 +
 clang/docs/ReleaseNotes.rst | 46 +-
 clang/docs/SanitizerSpecialCaseList.rst | 1 +
 clang/docs/ThinLTO.rst | 32 +
 clang/docs/index.rst | 1 +
 clang/include/clang-c/Index.h | 15 +
 clang/include/clang/AST/ASTContext.h | 12 +-
 clang/include/clang/AST/AbstractBasicReader.h | 7 +-
 clang/include/clang/AST/AbstractBasicWriter.h | 6 +-
 clang/include/clang/AST/Decl.h | 27 +-
 clang/include/clang/AST/DeclCXX.h | 24 +-
 clang/include/clang/AST/NestedNameSpecifier.h | 47 +-
 clang/include/clang/AST/PropertiesBase.td | 1 +
 clang/include/clang/AST/RecursiveASTVisitor.h | 2 -
 clang/include/clang/ASTMatchers/ASTMatchers.h | 9 +-
 .../Analysis/Analyses/UninitializedValues.h | 6 +
 clang/include/clang/Basic/BuiltinsAMDGPU.def | 49 +
 clang/include/clang/Basic/BuiltinsPPC.def | 6 +
 clang/include/clang/Basic/BuiltinsRISCV.td | 5 +
 .../clang/Basic/BuiltinsRISCVXAndes.td | 27 +
 clang/include/clang/Basic/BuiltinsSPIRVVK.td | 1 +
 clang/include/clang/Basic/CodeGenOptions.def | 32 +-
 clang/include/clang/Basic/CodeGenOptions.h | 40 +-
 clang/include/clang/Basic/DebugOptions.def | 58 +-
 clang/include/clang/Basic/DeclNodes.td | 5 +-
 .../include/clang/Basic/DiagnosticASTKinds.td | 12 +-
 clang/include/clang/Basic/DiagnosticGroups.td | 5 +-
 .../clang/Basic/DiagnosticSemaKinds.td | 42 +-
 .../Basic/DiagnosticSerializationKinds.td | 4 +
 clang/include/clang/Basic/Features.def | 9 +
 clang/include/clang/Basic/LangOptions.def | 10 +-
 clang/include/clang/Basic/LangOptions.h | 24 -
 .../include/clang/Basic/PointerAuthOptions.h | 35 +
 clang/include/clang/Basic/TargetInfo.h | 11 +-
 .../CIR/Dialect/Builder/CIRBaseBuilder.h | 21 +
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td | 25 +-
 .../include/clang/CIR/Dialect/IR/CIRDialect.h | 6 +-
 .../clang/CIR/Dialect/IR/CIRDialect.td | 6 +-
 .../clang/CIR/Dialect/IR/CIREnumAttr.td | 38 +
 clang/include/clang/CIR/Dialect/IR/CIROps.td | 360 +-
 clang/include/clang/CIR/Dialect/IR/CIRTypes.h | 6 +-
 .../include/clang/CIR/Dialect/IR/CIRTypes.td | 6 +-
 clang/include/clang/CIR/Dialect/Passes.h | 1 +
 clang/include/clang/CIR/Dialect/Passes.td | 19 +-
 clang/include/clang/CIR/MissingFeatures.h | 1 +
 clang/include/clang/Driver/Options.td | 42 +-
 clang/include/clang/Format/Format.h | 12 +-
 clang/include/clang/Frontend/ASTUnit.h | 2 +
 .../include/clang/Frontend/CompilerInstance.h | 1 +
 clang/include/clang/Sema/DeclSpec.h | 27 +-
 clang/include/clang/Sema/ParsedAttr.h | 4 +-
 .../include/clang/Serialization/ASTBitCodes.h | 5 +-
 clang/include/clang/Serialization/ASTReader.h | 39 +-
 clang/include/clang/Serialization/ASTWriter.h | 21 +-
 .../Core/PathSensitive/ExprEngine.h | 7 +-
 .../Refactoring/RecursiveSymbolVisitor.h | 3 +-
 clang/lib/AST/ASTContext.cpp | 143 +-
 clang/lib/AST/ASTImporter.cpp | 14 +-
 clang/lib/AST/ASTStructuralEquivalence.cpp | 3 -
 clang/lib/AST/ByteCode/Compiler.cpp | 190 +-
 clang/lib/AST/ByteCode/Compiler.h | 4 +-
 clang/lib/AST/ByteCode/Interp.cpp | 17 +-
 clang/lib/AST/ByteCode/Interp.h | 209 +-
 clang/lib/AST/ByteCode/InterpBlock.h | 5 -
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 1 -
 clang/lib/AST/ByteCode/Opcodes.td | 28 +-
 clang/lib/AST/ByteCode/Pointer.cpp | 11 +-
 clang/lib/AST/ByteCode/State.h | 1 +
 clang/lib/AST/Decl.cpp | 9 +-
 clang/lib/AST/DeclCXX.cpp | 20 +-
 clang/lib/AST/ExprConstant.cpp | 125 +-
 clang/lib/AST/ItaniumMangle.cpp | 14 +-
 clang/lib/AST/NestedNameSpecifier.cpp | 80 +-
 clang/lib/AST/ODRHash.cpp | 3 -
 clang/lib/AST/QualTypeNames.cpp | 11 +-
 clang/lib/AST/StmtPrinter.cpp | 2 +-
 clang/lib/AST/TextNodeDumper.cpp | 4 -
 clang/lib/ASTMatchers/Dynamic/Parser.cpp | 10 +
 clang/lib/Analysis/LifetimeSafety.cpp | 405 +-
 clang/lib/Analysis/UninitializedValues.cpp | 24 +-
 clang/lib/Analysis/UnsafeBufferUsage.cpp | 210 +-
 clang/lib/Basic/CMakeLists.txt | 1 -
 clang/lib/Basic/CodeGenOptions.cpp | 6 +-
 clang/lib/Basic/OpenMPKinds.cpp | 2 +-
 clang/lib/Basic/Targets.cpp | 12 -
 clang/lib/Basic/Targets/ARM.cpp | 3 -
 clang/lib/Basic/Targets/OSTargets.h | 47 -
 clang/lib/Basic/Targets/PNaCl.cpp | 29 -
 clang/lib/Basic/Targets/PNaCl.h | 90 -
 clang/lib/Basic/Targets/SPIR.h | 6 +
 clang/lib/CIR/CodeGen/CIRGenBuilder.h | 16 -
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 40 +-
 clang/lib/CIR/CodeGen/CIRGenClass.cpp | 2 +-
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 58 +-
 clang/lib/CIR/CodeGen/CIRGenFunction.h | 8 +
 clang/lib/CIR/CodeGen/CIRGenValue.h | 6 +
 .../lib/CIR/Dialect/Transforms/CMakeLists.txt | 1 +
 .../Dialect/Transforms/LoweringPrepare.cpp | 95 +
 clang/lib/CIR/Lowering/CIRPasses.cpp | 2 +
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 16 +
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h | 10 +
 clang/lib/CodeGen/BackendUtil.cpp | 8 +-
 clang/lib/CodeGen/CGBlocks.cpp | 25 +-
 clang/lib/CodeGen/CGCoroutine.cpp | 14 +-
 clang/lib/CodeGen/CGDebugInfo.cpp | 56 +-
 clang/lib/CodeGen/CGException.cpp | 51 +-
 clang/lib/CodeGen/CGLoopInfo.cpp | 46 +-
 clang/lib/CodeGen/CGObjC.cpp | 52 +-
 clang/lib/CodeGen/CGObjCGNU.cpp | 5 +-
 clang/lib/CodeGen/CGObjCMac.cpp | 94 +-
 clang/lib/CodeGen/CGOpenMPRuntime.cpp | 4 +-
 clang/lib/CodeGen/CMakeLists.txt | 1 -
 clang/lib/CodeGen/CodeGenAction.cpp | 2 +-
 clang/lib/CodeGen/CodeGenFunction.cpp | 3 +-
 clang/lib/CodeGen/CodeGenModule.cpp | 8 +-
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 207 +-
 clang/lib/CodeGen/TargetBuiltins/PPC.cpp | 5 +
 clang/lib/CodeGen/TargetBuiltins/RISCV.cpp | 6 +
 clang/lib/CodeGen/TargetBuiltins/SPIR.cpp | 12 +
 clang/lib/CodeGen/TargetInfo.h | 3 -
 clang/lib/CodeGen/Targets/PNaCl.cpp | 114 -
 clang/lib/CodeGen/Targets/X86.cpp | 35 +-
 clang/lib/Driver/CMakeLists.txt | 1 -
 clang/lib/Driver/Driver.cpp | 4 -
 clang/lib/Driver/ToolChains/Clang.cpp | 16 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 11 +
 clang/lib/Driver/ToolChains/Gnu.cpp | 29 +
 clang/lib/Driver/ToolChains/MinGW.cpp | 4 +-
 clang/lib/Driver/ToolChains/NaCl.cpp | 370 -
 clang/lib/Driver/ToolChains/NaCl.h | 88 -
 clang/lib/ExtractAPI/DeclarationFragments.cpp | 15 +-
 clang/lib/Format/Format.cpp | 45 +-
 clang/lib/Format/TokenAnnotator.cpp | 10 +-
 clang/lib/Frontend/ASTUnit.cpp | 36 +-
 clang/lib/Frontend/ChainedIncludesSource.cpp | 5 +-
 clang/lib/Frontend/CompilerInstance.cpp | 8 +-
 clang/lib/Frontend/CompilerInvocation.cpp | 128 +-
 clang/lib/Frontend/FrontendAction.cpp | 5 +-
 clang/lib/Frontend/FrontendActions.cpp | 17 +-
 clang/lib/Frontend/InitPreprocessor.cpp | 35 +-
 clang/lib/Frontend/PrecompiledPreamble.cpp | 8 +-
 clang/lib/Headers/CMakeLists.txt | 23 +-
 clang/lib/Headers/__clang_spirv_builtins.h | 24 +-
 .../Headers/cuda_wrappers/__utility/declval.h | 28 +
 .../lib/Headers/hlsl/hlsl_intrinsic_helpers.h | 10 +
 clang/lib/Headers/hlsl/hlsl_intrinsics.h | 59 +
 clang/lib/Headers/ptrauth.h | 57 +
 clang/lib/Headers/riscv_nds.h | 35 +
 clang/lib/Index/IndexTypeSourceInfo.cpp | 4 -
 clang/lib/Lex/InitHeaderSearch.cpp | 1 -
 clang/lib/Parse/ParseDeclCXX.cpp | 10 +-
 clang/lib/Sema/AnalysisBasedWarnings.cpp | 194 +-
 clang/lib/Sema/DeclSpec.cpp | 15 +-
 clang/lib/Sema/SemaAMDGPU.cpp | 1 +
 clang/lib/Sema/SemaCXXScopeSpec.cpp | 6 +-
 clang/lib/Sema/SemaConcept.cpp | 5 +
 clang/lib/Sema/SemaDecl.cpp | 4 +-
 clang/lib/Sema/SemaDeclAttr.cpp | 2 +-
 clang/lib/Sema/SemaDeclCXX.cpp | 22 +-
 clang/lib/Sema/SemaDeclObjC.cpp | 8 +
 clang/lib/Sema/SemaExprCXX.cpp | 5 +-
 clang/lib/Sema/SemaHLSL.cpp | 2 +-
 clang/lib/Sema/SemaInit.cpp | 6 +-
 clang/lib/Sema/SemaLookup.cpp | 16 +-
 clang/lib/Sema/SemaM68k.cpp | 2 +-
 clang/lib/Sema/SemaMSP430.cpp | 2 +-
 clang/lib/Sema/SemaObjCProperty.cpp | 9 +
 clang/lib/Sema/SemaOpenACC.cpp | 70 +-
 clang/lib/Sema/SemaRISCV.cpp | 2 +-
 clang/lib/Sema/SemaSPIRV.cpp | 80 +
 clang/lib/Sema/SemaTemplate.cpp | 1 -
 clang/lib/Sema/SemaTemplateDeduction.cpp | 8 +-
 clang/lib/Sema/SemaType.cpp | 6 +-
 clang/lib/Sema/TreeTransform.h | 25 +-
 clang/lib/Serialization/ASTReader.cpp | 139 +-
 clang/lib/Serialization/ASTReaderDecl.cpp | 2 +-
 clang/lib/Serialization/ASTWriter.cpp | 26 +-
 clang/lib/Serialization/GeneratePCH.cpp | 6 +-
 .../Checkers/WebKit/ASTUtils.cpp | 5 +-
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 46 +-
 clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp | 48 -
 clang/lib/StaticAnalyzer/Core/MemRegion.cpp | 30 +-
 clang/lib/Tooling/Syntax/BuildTree.cpp | 1 -
 clang/test/AST/ByteCode/complex.cpp | 6 +-
 clang/test/AST/ByteCode/const-eval.c | 2 +
 clang/test/AST/ByteCode/cxx11.cpp | 4 +-
 clang/test/AST/ByteCode/cxx23.cpp | 19 +
 clang/test/AST/ByteCode/placement-new.cpp | 8 +
 clang/test/AST/ByteCode/records.cpp | 10 +-
 clang/test/AST/ByteCode/unions.cpp | 141 +-
 .../Checkers/WebKit/unretained-call-args.mm | 4 +
 clang/test/Analysis/div-zero-cxx20.cpp | 61 +
 clang/test/Analysis/div-zero.cpp | 60 +
 clang/test/Analysis/stream.c | 28 +-
 clang/test/CIR/CodeGen/builtin_bit.cpp | 138 +
 clang/test/CIR/CodeGen/complex-builtins.cpp | 36 +
 clang/test/CIR/CodeGen/complex-unary.cpp | 286 +
 clang/test/CXX/drs/cwg14xx.cpp | 2 +
 clang/test/CXX/drs/cwg8xx.cpp | 4 +-
 clang/test/CXX/expr/expr.const/p2-0x.cpp | 10 +-
 .../dcl.module/dcl.module.interface/p1.cppm | 4 +-
 clang/test/CXX/module/module.interface/p1.cpp | 44 +-
 clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c | 16 +
 .../PowerPC/builtins-ppc-fpconstrained.c | 2 +-
 .../PowerPC/ppc-dmf-future-builtin-err.c | 15 +
 ...uiltin-err.c => ppc-dmf-mma-builtin-err.c} | 6 +
 .../CodeGen/RISCV/riscv-xandesbfhcvt-c-api.c | 25 +
 clang/test/CodeGen/RISCV/riscv-xandesbfhcvt.c | 23 +
 .../non-overloaded/{vcpopv.c => vcpop.c} | 0
 .../overloaded/{vcpopv.c => vcpop.c} | 0
 .../non-overloaded/{vcpopv.c => vcpop.c} | 0
 .../policy/overloaded/{vcpopv.c => vcpop.c} | 0
 clang/test/CodeGen/X86/i128-debuginfo.c | 10 +
 .../test/CodeGen/X86/x86_64-arguments-nacl.c | 92 -
 clang/test/CodeGen/X86/x86_64-arguments.c | 39 +
 clang/test/CodeGen/X86/x86_64-longdouble.c | 16 -
 clang/test/CodeGen/alloc-align-attr.c | 58 +-
 clang/test/CodeGen/arm-aapcs-vfp.c | 6 -
 clang/test/CodeGen/builtins.c | 18 +-
 clang/test/CodeGen/ext-int-cc.c | 13 +-
 clang/test/CodeGen/extend-arg-64.c | 2 +-
 clang/test/CodeGen/long_double_fp128.cpp | 4 -
 clang/test/CodeGen/malign-double-x86-nacl.c | 43 -
 .../CodeGen/new-pass-manager-opt-bisect.c | 2 +-
 .../null-sanitizer-debug-info-regression.cpp | 5 +
 clang/test/CodeGen/target-data.c | 16 -
 clang/test/CodeGenCXX/pragma-loop.cpp | 41 +
 .../CodeGenCXX/x86_64-arguments-nacl-x32.cpp | 57 -
 clang/test/CodeGenCoroutines/coro-gro.cpp | 29 +-
 clang/test/CodeGenHLSL/builtins/refract.hlsl | 244 +
 clang/test/CodeGenObjC/arc.m | 8 +-
 clang/test/CodeGenObjC/gnustep2-class-exts.m | 25 +
 .../test/CodeGenObjC/matrix-type-operators.m | 10 +-
 .../test/CodeGenObjC/ptrauth-attr-exception.m | 17 +
 clang/test/CodeGenObjC/ptrauth-block-isa.m | 40 +
 clang/test/CodeGenObjC/ptrauth-class-ro.m | 24 +
 clang/test/CodeGenObjC/ptrauth-class.m | 103 +
 .../ptrauth-objc-interface-selector.m | 130 +
 .../test/CodeGenObjC/ptrauth-objc-isa-super.m | 57 +
 .../ptrauth-objc-method-list-pointer.m | 17 +
 .../CodeGenObjC/ptrauth-property-backing.m | 80 +
 .../ptrauth-objc-interface-selector.mm | 90 +
 clang/test/CodeGenOpenCL/amdgpu-features.cl | 2 +-
 .../builtins-amdgcn-gfx1250-wmma-w32.cl | 433 +
 .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 153 +
 .../builtins-amdgcn-raw-buffer-load-lds.cl | 9 +
 clang/test/CodeGenOpenCL/scoped-atomic.cl | 18 +
 clang/test/CodeGenSPIRV/Builtins/refract.c | 74 +
 .../KeyInstructions/array-cookie.cpp | 12 +
 clang/test/Driver/DTLTO/dtlto.c | 48 +
 clang/test/Driver/arm-alignment.c | 6 -
 clang/test/Driver/frame-pointer-elim.c | 2 +-
 clang/test/Driver/mingw.cpp | 4 +
 clang/test/Driver/nacl-direct.c | 146 -
 clang/test/Driver/openacc.c | 12 -
 .../Driver/print-supported-extensions-riscv.c | 1 +
 clang/test/Driver/riscv-cpus.c | 34 +-
 clang/test/Driver/unsupported-target-arch.c | 4 -
 clang/test/Driver/x86_64-nacl-defines.cpp | 45 -
 clang/test/Format/multiple-inputs-error.cpp | 2 +-
 clang/test/Format/ranges.cpp | 11 +-
 clang/test/Frontend/x86_64-nacl-types.cpp | 37 -
 clang/test/Headers/spirv_ids.cpp | 104 +-
 clang/test/Misc/warning-wall.c | 1 +
 clang/test/Modules/cxx20-10-2-ex1.cpp | 2 +-
 clang/test/Modules/cxx20-export-import.cpp | 2 +-
 .../Modules/cxx20-import-diagnostics-a.cpp | 2 +-
 clang/test/Modules/export-in-non-modules.cpp | 2 +-
 clang/test/PCH/no-validate-pch.cl | 2 +-
 clang/test/Parser/gh114815.cpp | 6 +
 clang/test/Preprocessor/openacc.c | 8 +-
 .../predefined-macros-no-warnings.c | 4 -
 .../riscv-target-features-andes.c | 9 +
 clang/test/Sema/attr-nonblocking-sema.cpp | 22 +-
 clang/test/Sema/const-eval.c | 5 +-
 .../Sema/warn-lifetime-safety-dataflow.cpp | 30 +-
 .../SemaCXX/constant-expression-cxx11.cpp | 4 +-
 .../SemaCXX/constant-expression-cxx14.cpp | 126 +-
 .../SemaCXX/constant-expression-cxx2a.cpp | 2 +-
 .../SemaCXX/constant-expression-p2280r4.cpp | 58 +-
 .../SemaCXX/constexpr-backtrace-limit.cpp | 4 +-
 .../test/SemaCXX/constexpr-never-constant.cpp | 7 +
 clang/test/SemaCXX/cxx2b-deducing-this.cpp | 10 +
 .../SemaCXX/cxx2c-constexpr-placement-new.cpp | 1 +
 clang/test/SemaCXX/noreturn-vars.cpp | 227 +
 ...re-class-scoped-mismatched-constraints.cpp | 10 +-
 clang/test/SemaCXX/type-aware-coroutines.cpp | 2 +-
 .../test/SemaCXX/type-aware-new-constexpr.cpp | 16 +-
 .../SemaCXX/type-aware-new-delete-arrays.cpp | 8 +-
 ...are-new-delete-basic-free-declarations.cpp | 8 +-
 ...new-delete-basic-in-class-declarations.cpp | 70 +-
 ...type-aware-new-delete-basic-resolution.cpp | 8 +-
 .../type-aware-new-delete-qualifiers.cpp | 8 +-
 ...-aware-new-delete-transparent-contexts.cpp | 6 +-
 .../type-aware-new-invalid-type-identity.cpp | 12 +-
 .../type-aware-placement-operators.cpp | 8 +-
 clang/test/SemaCXX/uninitialized.cpp | 4 +
 .../SemaCXX/warn-thread-safety-analysis.cpp | 14 +
 .../warn-uninitialized-const-pointer.cpp | 35 +
 ...arn-unsafe-buffer-usage-libc-functions.cpp | 3 +
 .../SemaHLSL/BuiltIns/refract-errors.hlsl | 66 +
 clang/test/SemaObjC/ptrauth-pointers.m | 46 +
 clang/test/SemaObjC/ptrauth-qualifier.m | 39 +-
 ...te_firstprivate_reduction_required_ops.cpp | 103 +
 ...ins-amdgcn-error-gfx1250-wmma-w32-param.cl | 242 +
 ...ns-amdgcn-raw-ptr-buffer-load-lds-error.cl | 7 +
 ...cn-raw-ptr-buffer-load-lds-target-error.cl | 3 +-
 .../test/SemaSPIRV/BuiltIns/refract-errors.c | 41 +
 .../test/SemaTemplate/concepts-using-decl.cpp | 21 +
 clang/test/SemaTemplate/gh138371.cpp | 22 +
 clang/tools/clang-format/ClangFormat.cpp | 10 +-
 clang/tools/libclang/CIndex.cpp | 11 -
 clang/tools/libclang/CMakeLists.txt | 1 +
 clang/tools/libclang/Obsolete.cpp | 48 +
 clang/tools/libclang/libclang.map | 7 +-
 .../ASTMatchers/ASTMatchersNarrowingTest.cpp | 13 +
 clang/unittests/Format/ConfigParseTest.cpp | 22 +-
 clang/unittests/Format/SortIncludesTest.cpp | 20 +
 clang/unittests/Format/TokenAnnotatorTest.cpp | 4 +
 .../NestedNameSpecifiers.cpp | 4 +-
 clang/unittests/Tooling/RefactoringTest.cpp | 3 +-
 cmake/Modules/LLVMVersion.cmake | 2 +-
 compiler-rt/lib/asan/asan_descriptions.cpp | 10 +
 .../scudo/standalone/tests/common_test.cpp | 70 +-
 compiler-rt/lib/tysan/lit.cfg | 2 +-
 .../asan/TestCases/Darwin/lit.local.cfg.py | 2 +-
 .../asan/TestCases/Linux/lit.local.cfg.py | 2 +-
 .../asan/TestCases/Posix/lit.local.cfg.py | 2 +-
 .../asan/TestCases/Windows/lit.local.cfg.py | 2 +-
 compiler-rt/test/asan/lit.cfg.py | 26 +-
 compiler-rt/test/asan_abi/lit.cfg.py | 2 +-
 .../TestCases/Darwin/lit.local.cfg.py | 2 +-
 compiler-rt/test/builtins/Unit/lit.cfg.py | 6 +-
 compiler-rt/test/builtins/lit.cfg.py | 2 +-
 .../test/cfi/cross-dso/lit.local.cfg.py | 2 +-
 compiler-rt/test/ctx_profile/lit.cfg.py | 2 +-
 compiler-rt/test/dfsan/lit.cfg.py | 2 +-
 compiler-rt/test/fuzzer/lit.cfg.py | 2 +-
 compiler-rt/test/gwp_asan/lit.cfg.py | 2 +-
 .../hwasan/TestCases/Linux/lit.local.cfg.py | 2 +-
 .../hwasan/TestCases/Posix/lit.local.cfg.py | 2 +-
 compiler-rt/test/hwasan/lit.cfg.py | 2 +-
 compiler-rt/test/lit.common.cfg.py | 46 +-
 compiler-rt/test/lit.common.configured.in | 2 +-
 .../lsan/TestCases/Darwin/lit.local.cfg.py | 2 +-
 .../lsan/TestCases/Linux/lit.local.cfg.py | 2 +-
 .../lsan/TestCases/Posix/lit.local.cfg.py | 2 +-
 compiler-rt/test/lsan/lit.common.cfg.py | 12 +-
 compiler-rt/test/memprof/lit.cfg.py | 2 +-
 compiler-rt/test/metadata/lit.cfg.py | 2 +-
 compiler-rt/test/msan/Linux/lit.local.cfg.py | 2 +-
 compiler-rt/test/msan/lit.cfg.py | 6 +-
 compiler-rt/test/nsan/lit.cfg.py | 2 +-
 .../orc/TestCases/Darwin/lit.local.cfg.py | 2 +-
 .../orc/TestCases/FreeBSD/lit.local.cfg.py | 2 +-
 .../test/orc/TestCases/Linux/lit.local.cfg.py | 2 +-
 .../orc/TestCases/Windows/lit.local.cfg.py | 2 +-
 compiler-rt/test/orc/lit.cfg.py | 12 +-
 compiler-rt/test/orc/lit.site.cfg.py.in | 3 +-
 compiler-rt/test/profile/AIX/lit.local.cfg.py | 2 +-
 .../test/profile/Darwin/lit.local.cfg.py | 2 +-
 .../test/profile/Linux/lit.local.cfg.py | 2 +-
 .../test/profile/Posix/lit.local.cfg.py | 4 +-
 .../test/profile/Windows/lit.local.cfg.py | 2 +-
 compiler-rt/test/profile/lit.cfg.py | 10 +-
 .../test/rtsan/Unit/lit.site.cfg.py.in | 2 +-
 compiler-rt/test/rtsan/lit.cfg.py | 8 +-
 compiler-rt/test/safestack/lit.cfg.py | 2 +-
 .../TestCases/Darwin/lit.local.cfg.py | 2 +-
 .../TestCases/FreeBSD/lit.local.cfg.py | 2 +-
 .../TestCases/Linux/lit.local.cfg.py | 2 +-
 .../TestCases/NetBSD/lit.local.cfg.py | 2 +-
 .../TestCases/Posix/lit.local.cfg.py | 2 +-
 .../test/sanitizer_common/lit.common.cfg.py | 10 +-
 compiler-rt/test/scudo/lit.cfg.py | 2 +-
 compiler-rt/test/shadowcallstack/lit.cfg.py | 2 +-
 compiler-rt/test/tsan/Darwin/lit.local.cfg.py | 2 +-
 compiler-rt/test/tsan/Linux/lit.local.cfg.py | 2 +-
 compiler-rt/test/tsan/Unit/lit.site.cfg.py.in | 2 +-
 compiler-rt/test/tsan/libcxx/lit.local.cfg.py | 2 +-
 .../test/tsan/libdispatch/lit.local.cfg.py | 2 +-
 compiler-rt/test/tsan/lit.cfg.py | 10 +-
 compiler-rt/test/tysan/lit.cfg.py | 8 +-
 .../TestCases/Misc/Posix/lit.local.cfg.py | 2 +-
 .../TypeCheck/Function/lit.local.cfg.py | 2 +-
 .../TypeCheck/Linux/lit.local.cfg.py | 2 +-
 compiler-rt/test/ubsan/lit.common.cfg.py | 4 +-
 .../test/ubsan_minimal/lit.common.cfg.py | 2 +-
 compiler-rt/test/xray/lit.cfg.py | 8 +-
 compiler-rt/unittests/lit.common.unit.cfg.py | 2 +-
 .../unittests/lit.common.unit.configured.in | 2 +-
 .../dexter/dex/tools/TestToolBase.py | 15 +-
 cross-project-tests/dtlto/ld-dtlto.c | 31 +-
 cross-project-tests/lit.cfg.py | 14 +-
 cross-project-tests/lit.site.cfg.py.in | 4 +
 .../include/flang-rt/runtime/non-tbp-dio.h | 7 +-
 flang-rt/include/flang-rt/runtime/type-info.h | 16 +-
 flang-rt/lib/cuda/descriptor.cpp | 9 +
 flang-rt/lib/runtime/derived.cpp | 2 +-
 flang-rt/lib/runtime/descriptor-io.cpp | 88 +-
 flang-rt/lib/runtime/extensions.cpp | 8 +-
 flang-rt/lib/runtime/non-tbp-dio.cpp | 2 +-
 flang-rt/lib/runtime/type-info.cpp | 2 +-
 .../unittests/Runtime/CUDA/AllocatorCUF.cpp | 10 +
 flang/docs/Extensions.md | 18 +-
 flang/docs/Intrinsics.md | 3 +-
 .../docs/ParallelMultiImageFortranRuntime.md | 18 +
 flang/docs/index.md | 1 +
 flang/include/flang/Common/format.h | 79 +-
 .../flang/Optimizer/Builder/IntrinsicCall.h | 4 +-
 .../Builder/Runtime/CUDA/Descriptor.h | 4 +
 .../flang/Optimizer/Dialect/CUF/CUFOps.td | 21 +
 flang/include/flang/Parser/message.h | 4 +-
 flang/include/flang/Parser/parse-tree.h | 10 +
 flang/include/flang/Runtime/CUDA/descriptor.h | 4 +
 flang/include/flang/Runtime/extensions.h | 9 +-
 .../flang/Semantics/runtime-type-info.h | 7 +-
 flang/include/flang/Semantics/tools.h | 2 +
 flang/lib/Evaluate/intrinsics.cpp | 1 +
 flang/lib/Frontend/FrontendAction.cpp | 9 +-
 flang/lib/Lower/Bridge.cpp | 24 +-
 flang/lib/Lower/ConvertCall.cpp | 5 +-
 flang/lib/Lower/ConvertConstant.cpp | 4 +-
 flang/lib/Lower/ConvertExpr.cpp | 7 +-
 flang/lib/Lower/ConvertExprToHLFIR.cpp | 2 +-
 flang/lib/Lower/ConvertVariable.cpp | 109 +-
 flang/lib/Lower/HostAssociations.cpp | 6 +-
 flang/lib/Lower/IO.cpp | 10 +-
 flang/lib/Lower/OpenACC.cpp | 122 +-
 .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 57 +-
 flang/lib/Lower/OpenMP/DataSharingProcessor.h | 12 +-
 flang/lib/Lower/OpenMP/OpenMP.cpp | 84 -
 flang/lib/Lower/OpenMP/Utils.cpp | 84 +
 flang/lib/Lower/OpenMP/Utils.h | 3 +
 flang/lib/Optimizer/Builder/HLFIRTools.cpp | 2 +-
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 51 +-
 .../Optimizer/Builder/LowLevelIntrinsics.cpp | 13 +-
 flang/lib/Optimizer/Builder/MutableBox.cpp | 28 +-
 .../Builder/Runtime/CUDA/Descriptor.cpp | 15 +
 flang/lib/Optimizer/CodeGen/CMakeLists.txt | 1 +
 flang/lib/Optimizer/CodeGen/CodeGen.cpp | 27 +-
 .../Optimizer/CodeGen/LowerRepackArrays.cpp | 56 +-
 flang/lib/Optimizer/CodeGen/Target.cpp | 25 +-
 flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp | 11 +
 flang/lib/Optimizer/Dialect/FIROps.cpp | 2 +-
 .../HLFIR/Transforms/BufferizeHLFIR.cpp | 2 +-
 .../Optimizer/Transforms/CUFDeviceGlobal.cpp | 42 +-
 .../Optimizer/Transforms/CUFOpConversion.cpp | 33 +-
 flang/lib/Parser/message.cpp | 17 +-
 flang/lib/Parser/openmp-parsers.cpp | 9 +-
 flang/lib/Parser/parse-tree.cpp | 47 +
 flang/lib/Parser/prescan.cpp | 4 +-
 flang/lib/Parser/program-parsers.cpp | 2 +-
 flang/lib/Parser/type-parsers.h | 1 +
 flang/lib/Semantics/check-declarations.cpp | 48 +-
 flang/lib/Semantics/check-omp-structure.cpp | 3 +-
 flang/lib/Semantics/expression.cpp | 66 +-
 flang/lib/Semantics/resolve-labels.cpp | 25 +-
 flang/lib/Semantics/resolve-names.cpp | 6 +-
 flang/lib/Semantics/runtime-type-info.cpp | 28 +-
 flang/lib/Semantics/semantics.cpp | 5 +-
 flang/lib/Semantics/tools.cpp | 6 +-
 flang/module/__fortran_type_info.f90 | 2 +-
 flang/module/cudadevice.f90 | 10 +
 flang/test/Driver/cuda-option.f90 | 2 +-
 flang/test/Driver/fatal-errors-warnings.f90 | 31 +
 flang/test/Driver/unparse-use-analyzed.f95 | 4 +-
 flang/test/Driver/unparse-with-modules.f90 | 2 +-
 flang/test/Fir/CUDA/cuda-alloc-free.fir | 15 +
 flang/test/Fir/CUDA/cuda-device-global.f90 | 23 +
 flang/test/Fir/alloc-32.fir | 4 +-
 flang/test/Fir/alloc.fir | 48 +-
 flang/test/Fir/arrexp.fir | 4 +-
 flang/test/Fir/convert-to-llvm.fir | 32 +-
 .../test/Integration/debug-common-block-1.f90 | 2 +-
 flang/test/Integration/debug-local-var-2.f90 | 2 +-
 flang/test/Lower/CUDA/cuda-derived.cuf | 2 +-
 flang/test/Lower/CUDA/cuda-device-proc.cuf | 7 +-
 flang/test/Lower/CUDA/cuda-return01.cuf | 2 +-
 flang/test/Lower/CUDA/cuda-return02.cuf | 2 +-
 flang/test/Lower/CUDA/cuda-set-allocator.cuf | 24 +
 .../Lower/HLFIR/intrinsic-subroutines.f90 | 2 +-
 ...ointer-component-structure-constructor.f90 | 2 +-
 flang/test/Lower/Intrinsics/cospi.f90 | 22 +
 flang/test/Lower/OpenACC/acc-atomic-read.f90 | 2 +-
 flang/test/Lower/OpenACC/acc-atomic-write.f90 | 2 +-
 .../acc-host-data-unwrap-defaultbounds.f90 | 14 +-
 flang/test/Lower/OpenACC/acc-host-data.f90 | 21 +-
 flang/test/Lower/OpenACC/acc-routine.f90 | 7 +-
 flang/test/Lower/OpenACC/acc-routine03.f90 | 2 +-
 flang/test/Lower/OpenACC/acc-routine04.f90 | 2 +-
 flang/test/Lower/OpenACC/acc-use-device.f90 | 61 +
 flang/test/Lower/OpenMP/atomic-read.f90 | 2 +-
 flang/test/Lower/OpenMP/atomic-write.f90 | 2 +-
 .../Lower/OpenMP/common-atomic-lowering.f90 | 2 +-
 flang/test/Lower/OpenMP/cray-pointers02.f90 | 2 +-
 .../Lower/OpenMP/default-clause-byref.f90 | 2 +-
 flang/test/Lower/OpenMP/default-clause.f90 | 2 +-
 .../parallel-reduction-allocatable-array.f90 | 2 +-
 .../OpenMP/parallel-reduction-array-lb.f90 | 2 +-
 .../OpenMP/parallel-reduction-array2.f90 | 2 +-
 .../Lower/OpenMP/parallel-reduction-byref.f90 | 2 +-
 .../parallel-reduction-pointer-array.f90 | 2 +-
 .../OpenMP/parallel-reduction-rename.f90 | 2 +-
 .../test/Lower/OpenMP/parallel-reduction.f90 | 2 +-
 flang/test/Lower/OpenMP/sections.f90 | 2 +-
 flang/test/Lower/OpenMP/taskgroup02.f90 | 5 +-
 .../threadprivate-host-association-2.f90 | 2 +-
 .../threadprivate-host-association-3.f90 | 2 +-
 .../OpenMP/threadprivate-host-association.f90 | 2 +-
 flang/test/Lower/OpenMP/wsloop-chunks.f90 | 2 +-
 flang/test/Lower/OpenMP/wsloop-collapse.f90 | 2 +-
 ...oop-reduction-allocatable-array-minmax.f90 | 2 +-
 .../OpenMP/wsloop-reduction-allocatable.f90 | 2 +-
 .../OpenMP/wsloop-reduction-array-lb.f90 | 2 +-
 .../OpenMP/wsloop-reduction-array-lb2.f90 | 2 +-
 .../Lower/OpenMP/wsloop-reduction-array.f90 | 2 +-
 .../Lower/OpenMP/wsloop-reduction-array2.f90 | 2 +-
 .../Lower/OpenMP/wsloop-reduction-min2.f90 | 2 +-
 .../wsloop-reduction-multiple-clauses.f90 | 2 +-
 .../Lower/OpenMP/wsloop-reduction-pointer.f90 | 2 +-
 flang/test/Lower/amdgcn-complex.f90 | 21 +
 flang/test/Lower/array-character.f90 | 2 +-
 flang/test/Lower/array-expression-slice-1.f90 | 2 +-
 flang/test/Lower/basic-program.f90 | 6 +-
 flang/test/Lower/big-integer-parameter.f90 | 2 +-
 .../test/Lower/derived-type-finalization.f90 | 2 +-
 flang/test/Lower/forall/character-1.f90 | 4 +-
 flang/test/Lower/io-derived-type.f90 | 58 +-
 flang/test/Lower/location.f90 | 2 +-
 flang/test/Lower/namelist.f90 | 8 +-
 flang/test/Lower/nested-where.f90 | 2 +-
 flang/test/Lower/polymorphic.f90 | 2 +-
 flang/test/Lower/pre-fir-tree02.f90 | 2 +-
 flang/test/Lower/pre-fir-tree03.f90 | 2 +-
 flang/test/Lower/pre-fir-tree06.f90 | 4 +-
 .../test/Lower/program-units-fir-mangling.f90 | 2 +-
 flang/test/Lower/return-statement.f90 | 2 +-
 flang/test/Lower/volatile-openmp.f90 | 8 +-
 flang/test/Lower/volatile-openmp1.f90 | 2 +-
 flang/test/Lower/volatile-string.f90 | 2 +-
 flang/test/Lower/volatile3.f90 | 2 +-
 flang/test/Parser/OpenMP/fail-construct1.f90 | 2 +-
 flang/test/Parser/acc-unparse.f90 | 2 +-
 .../Preprocessing/omp-sentinel-fixed-form.F | 21 +
 .../test/Semantics/OpenACC/acc-symbols01.f90 | 26 +-
 .../OpenMP/critical_within_default.f90 | 2 +-
 .../OpenMP/declare-mapper-symbols.f90 | 2 +-
 .../OpenMP/declare-reduction-mangled.f90 | 2 +-
 .../OpenMP/declare-reduction-operators.f90 | 2 +-
 .../OpenMP/declare-reduction-renamedop.f90 | 2 +-
 .../Semantics/OpenMP/declare-reduction.f90 | 2 +-
 .../Semantics/OpenMP/declare-target03.f90 | 6 +-
 flang/test/Semantics/OpenMP/do-schedule03.f90 | 32 +-
 .../Semantics/OpenMP/do01-positivecase.f90 | 10 +-
 .../Semantics/OpenMP/do04-positivecase.f90 | 18 +-
 .../Semantics/OpenMP/do05-positivecase.f90 | 36 +-
 .../Semantics/OpenMP/do06-positivecases.f90 | 14 +-
 flang/test/Semantics/OpenMP/do11.f90 | 20 +-
 flang/test/Semantics/OpenMP/do12.f90 | 76 +-
 flang/test/Semantics/OpenMP/do14.f90 | 66 +-
 flang/test/Semantics/OpenMP/do17.f90 | 42 +-
 .../Semantics/OpenMP/map-clause-symbols.f90 | 2 +-
 flang/test/Semantics/OpenMP/reduction08.f90 | 52 +-
 flang/test/Semantics/OpenMP/reduction09.f90 | 42 +-
 flang/test/Semantics/OpenMP/reduction11.f90 | 2 +-
 flang/test/Semantics/OpenMP/scan2.f90 | 2 +-
 flang/test/Semantics/OpenMP/symbol01.f90 | 54 +-
 flang/test/Semantics/OpenMP/symbol05.f90 | 8 +-
 flang/test/Semantics/OpenMP/symbol07.f90 | 6 +-
 flang/test/Semantics/OpenMP/symbol09.f90 | 6 +-
 .../test/Semantics/OpenMP/threadprivate03.f90 | 6 +-
 flang/test/Semantics/bind-c18.f90 | 7 +
 flang/test/Semantics/bug148559.f90 | 12 +
 flang/test/Semantics/bug148675.f90 | 21 +
 flang/test/Semantics/getsymbols03-a.f90 | 2 +-
 flang/test/Semantics/long-name.f90 | 2 +-
 flang/test/Semantics/modproc01.f90 | 2 +-
 flang/test/Semantics/multi-programs04.f90 | 2 +-
 flang/test/Semantics/pointer01.f90 | 1 -
 flang/test/Semantics/procinterface01.f90 | 26 +-
 flang/test/Semantics/resolve05.f90 | 1 -
 flang/test/Semantics/resolve125.f90 | 4 +-
 flang/test/Semantics/symbol03.f90 | 20 +-
 flang/test/Semantics/symbol06.f90 | 72 +-
 flang/test/Semantics/symbol07.f90 | 48 +-
 flang/test/Semantics/symbol08.f90 | 16 +-
 flang/test/Semantics/symbol15.f90 | 12 +-
 flang/test/Semantics/symbol16.f90 | 20 +-
 flang/test/Semantics/symbol17.f90 | 52 +-
 flang/test/Semantics/symbol18.f90 | 22 +-
 flang/test/Semantics/symbol20.f90 | 14 +-
 flang/test/Semantics/symbol25.f90 | 14 +-
 flang/test/Semantics/symbol26.f90 | 10 +-
 flang/test/Semantics/typeinfo01.f90 | 18 +-
 flang/test/Semantics/typeinfo02.f90 | 4 +-
 flang/test/Semantics/typeinfo09.f90 | 2 +-
 flang/test/Semantics/typeinfo13.f90 | 2 +-
 .../Transforms/DoConcurrent/basic_host.f90 | 2 +-
 flang/test/Transforms/lower-repack-arrays.fir | 128 +-
 .../Optimizer/Builder/CharacterTest.cpp | 2 +-
 .../Optimizer/Builder/ComplexTest.cpp | 2 +-
 .../Optimizer/Builder/FIRBuilderTest.cpp | 5 +-
 .../Optimizer/Builder/HLFIRToolsTest.cpp | 2 +-
 .../Builder/Runtime/RuntimeCallTestBase.h | 5 +-
 .../Optimizer/FortranVariableTest.cpp | 5 +-
 libc/CMakeLists.txt | 7 +
 libc/cmake/modules/LLVMLibCTestRules.cmake | 58 +-
 libc/config/baremetal/arm/entrypoints.txt | 3 +
 libc/config/baremetal/arm/headers.txt | 1 +
 libc/config/baremetal/riscv/entrypoints.txt | 3 +
 libc/config/baremetal/riscv/headers.txt | 1 +
 libc/config/darwin/aarch64/entrypoints.txt | 3 +
 libc/config/darwin/aarch64/headers.txt | 1 +
 libc/config/linux/aarch64/entrypoints.txt | 3 +
 libc/config/linux/aarch64/headers.txt | 1 +
 libc/config/linux/arm/entrypoints.txt | 3 +
 libc/config/linux/arm/headers.txt | 1 +
 libc/config/linux/riscv/entrypoints.txt | 3 +
 libc/config/linux/riscv/headers.txt | 1 +
 libc/config/linux/x86_64/entrypoints.txt | 2 +
 libc/config/linux/x86_64/headers.txt | 1 +
 libc/config/windows/entrypoints.txt | 3 +
 libc/config/windows/headers.txt | 1 +
 libc/docs/headers/search.rst | 4 +-
 libc/fuzzing/math/CMakeLists.txt | 54 +-
 libc/fuzzing/math/acos_fuzz.cpp | 42 +-
 libc/fuzzing/math/asin_fuzz.cpp | 43 +-
 libc/fuzzing/math/atan_fuzz.cpp | 38 -
 libc/fuzzing/math/cos_fuzz.cpp | 47 +-
 libc/fuzzing/math/exp10_fuzz.cpp | 41 +-
 libc/fuzzing/math/exp2_fuzz.cpp | 41 +-
 libc/fuzzing/math/exp_fuzz.cpp | 41 +-
 libc/fuzzing/math/expm1_fuzz.cpp | 41 +-
 libc/fuzzing/math/log10_fuzz.cpp | 51 +
 libc/fuzzing/math/log1p_fuzz.cpp | 50 +
 libc/fuzzing/math/log2_fuzz.cpp | 51 +
 libc/fuzzing/math/log_fuzz.cpp | 50 +
 libc/fuzzing/math/sin_fuzz.cpp | 47 +-
 libc/fuzzing/math/sincos_fuzz.cpp | 55 +-
 libc/fuzzing/math/sqrt_fuzz.cpp | 50 +
 libc/fuzzing/math/tan_fuzz.cpp | 47 +-
 libc/include/CMakeLists.txt | 10 +-
 libc/include/dlfcn.yaml | 33 +-
 libc/include/llvm-libc-macros/dlfcn-macros.h | 23 -
 libc/include/wctype.yaml | 10 +
 libc/shared/math.h | 3 +
 libc/shared/math/exp.h | 23 +
 libc/shared/math/exp10.h | 23 +
 libc/shared/math/exp10f.h | 23 +
 libc/src/CMakeLists.txt | 1 +
 libc/src/__support/CMakeLists.txt | 2 +-
 libc/src/__support/FPUtil/PolyEval.h | 2 +-
 libc/src/__support/FPUtil/double_double.h | 4 +-
 libc/src/__support/FPUtil/multiply_add.h | 2 +-
 libc/src/__support/FPUtil/nearest_integer.h | 4 +-
 libc/src/__support/math/CMakeLists.txt | 88 +
 libc/src/__support/math/exp.h | 448 +
 libc/src/__support/math/exp10.h | 501 +
 .../exp10f_impl.h => __support/math/exp10f.h} | 17 +-
 libc/src/__support/math/exp10f_utils.h | 157 +
 libc/src/__support/math/exp_constants.h | 174 +
 libc/src/__support/math/exp_utils.h | 72 +
 libc/src/__support/math/expf16_utils.h | 2 +-
 libc/src/__support/wchar/CMakeLists.txt | 14 +
 .../__support/wchar/character_converter.cpp | 11 +
 .../src/__support/wchar/character_converter.h | 4 +
 libc/src/__support/wchar/string_converter.h | 116 +
 libc/src/math/generic/CMakeLists.txt | 74 +-
 libc/src/math/generic/atanhf.cpp | 1 +
 libc/src/math/generic/common_constants.cpp | 157 -
 libc/src/math/generic/common_constants.h | 7 +-
 libc/src/math/generic/coshf.cpp | 2 +-
 libc/src/math/generic/exp.cpp | 429 +-
 libc/src/math/generic/exp10.cpp | 485 +-
 libc/src/math/generic/exp10f.cpp | 7 +-
 libc/src/math/generic/explogxf.cpp | 75 -
 libc/src/math/generic/explogxf.h | 235 +-
 libc/src/math/generic/powf.cpp | 7 +-
 libc/src/math/generic/sinhf.cpp | 1 +
 libc/src/string/memory_utils/CMakeLists.txt | 2 +
 libc/src/string/memory_utils/arm/common.h | 55 +
 .../string/memory_utils/arm/inline_memcpy.h | 193 +-
 .../string/memory_utils/arm/inline_memset.h | 156 +
 libc/src/string/memory_utils/inline_memset.h | 6 +-
 libc/src/wctype/CMakeLists.txt | 9 +
 libc/src/wctype/iswalpha.cpp | 19 +
 libc/src/wctype/iswalpha.h | 21 +
 libc/test/src/CMakeLists.txt | 1 +
 libc/test/src/__support/wchar/CMakeLists.txt | 15 +
 .../__support/wchar/string_converter_test.cpp | 423 +
 libc/test/src/math/cospif_test.cpp | 2 +-
 libc/test/src/math/explogxf_test.cpp | 5 -
 libc/test/src/math/sincosf_test.cpp | 2 +-
 libc/test/src/math/sinpif_test.cpp | 2 +-
 libc/test/src/wctype/CMakeLists.txt | 11 +
 libc/test/src/wctype/iswalpha_test.cpp | 54 +
 libclc/CMakeLists.txt | 25 +-
 libclc/clc/include/clc/atomic/atomic_decl.inc | 47 +
 .../clc/atomic/clc_atomic_compare_exchange.h | 26 +
 .../clc/include/clc/atomic/clc_atomic_dec.h | 23 +
 .../include/clc/atomic/clc_atomic_exchange.h | 24 +
 .../include/clc/atomic/clc_atomic_fetch_add.h | 24 +
 .../include/clc/atomic/clc_atomic_fetch_and.h | 21 +
 .../include/clc/atomic/clc_atomic_fetch_max.h | 24 +
 .../include/clc/atomic/clc_atomic_fetch_min.h | 24 +
 .../include/clc/atomic/clc_atomic_fetch_or.h | 21 +
 .../include/clc/atomic/clc_atomic_fetch_sub.h | 24 +
 .../include/clc/atomic/clc_atomic_fetch_xor.h | 21 +
 .../clc/include/clc/atomic/clc_atomic_inc.h | 23 +
 .../clc/include/clc/atomic/clc_atomic_load.h | 26 +
 .../clc/include/clc/atomic/clc_atomic_store.h | 26 +
 .../clc/include/clc/integer/clc_bit_reverse.h | 19 +
 .../clc/integer/clc_bitfield_extract_decl.inc | 10 +
 .../clc/integer/clc_bitfield_extract_signed.h | 23 +
 .../integer/clc_bitfield_extract_unsigned.h | 23 +
 .../include/clc/integer/clc_bitfield_insert.h | 18 +
 .../clc/integer/clc_bitfield_insert.inc | 11 +
 .../include/clc/relational/binary_decl.inc | 10 +-
 .../clc/include/clc/relational/clc_isfinite.h | 2 +-
 .../include/clc/relational/clc_isgreater.h | 2 +-
 .../clc/relational/clc_isgreaterequal.h | 2 +-
 .../clc/include/clc/relational/clc_isless.h | 2 +-
 .../include/clc/relational/clc_islessequal.h | 2 +-
 .../clc/relational/clc_islessgreater.h | 2 +-
 .../clc/include/clc/relational/clc_isnormal.h | 2 +-
 .../include/clc/relational/clc_isnotequal.h | 2 +-
 .../include/clc/relational/clc_isordered.h | 2 +-
 .../include/clc/relational/clc_isunordered.h | 2 +-
 .../clc/include/clc/relational/clc_signbit.h | 2 +-
 libclc/clc/include/clc/relational/floatn.inc | 132 -
 .../clc/include/clc/relational/unary_decl.inc | 10 +-
 libclc/clc/lib/generic/SOURCES | 17 +
 .../atomic/clc_atomic_compare_exchange.cl | 15 +
 .../atomic/clc_atomic_compare_exchange.inc | 64 +
 .../clc/lib/generic/atomic/clc_atomic_dec.cl | 16 +
 .../clc/lib/generic/atomic/clc_atomic_def.inc | 79 +
 .../lib/generic/atomic/clc_atomic_exchange.cl | 23 +
 .../generic/atomic/clc_atomic_fetch_add.cl | 18 +
 .../generic/atomic/clc_atomic_fetch_and.cl | 15 +
 .../generic/atomic/clc_atomic_fetch_max.cl | 18 +
 .../generic/atomic/clc_atomic_fetch_min.cl | 18 +
 .../lib/generic/atomic/clc_atomic_fetch_or.cl | 15 +
 .../generic/atomic/clc_atomic_fetch_sub.cl | 18 +
 .../generic/atomic/clc_atomic_fetch_xor.cl | 15 +
 .../clc/lib/generic/atomic/clc_atomic_inc.cl | 16 +
 .../clc/lib/generic/atomic/clc_atomic_load.cl | 24 +
 .../lib/generic/atomic/clc_atomic_store.cl | 22 +
 .../lib/generic/integer/clc_bit_reverse.cl | 15 +
 .../integer/clc_bitfield_extract_signed.cl | 13 +-
 .../integer/clc_bitfield_extract_signed.inc | 19 +
 .../integer/clc_bitfield_extract_unsigned.cl | 12 +
 .../integer/clc_bitfield_extract_unsigned.inc | 16 +
 .../generic/integer/clc_bitfield_insert.cl | 12 +
 .../generic/integer/clc_bitfield_insert.inc | 20 +
 .../lib/generic/math/clc_native_divide.inc | 1 +
 .../clc/lib/generic/math/clc_native_recip.inc | 1 +
 .../clc/lib/generic/math/clc_native_rsqrt.inc | 1 +
 .../clc/lib/generic/math/clc_native_tan.inc | 1 +
 .../include/clc/opencl/atomic/atomic_add.h | 2 +-
 .../include/clc/opencl/atomic/atomic_and.h | 2 +-
 .../atomic/atomic_compare_exchange_strong.h | 24 +
 .../atomic/atomic_compare_exchange_weak.h | 24 +
 .../include/clc/opencl/atomic/atomic_decl.inc | 58 +-
 .../clc/opencl/atomic/atomic_decl_legacy.inc | 22 +
 .../clc/opencl/atomic/atomic_exchange.h | 22 +
 .../clc/opencl/atomic/atomic_fetch_add.h | 22 +
 .../clc/opencl/atomic/atomic_fetch_and.h | 19 +
 .../clc/opencl/atomic/atomic_fetch_max.h | 22 +
 .../clc/opencl/atomic/atomic_fetch_min.h | 22 +
 .../clc/opencl/atomic/atomic_fetch_or.h | 19 +
 .../clc/opencl/atomic/atomic_fetch_sub.h | 22 +
 .../clc/opencl/atomic/atomic_fetch_xor.h | 19 +
 .../include/clc/opencl/atomic/atomic_load.h | 24 +
 .../include/clc/opencl/atomic/atomic_max.h | 2 +-
 .../include/clc/opencl/atomic/atomic_min.h | 2 +-
 .../include/clc/opencl/atomic/atomic_or.h | 2 +-
 .../include/clc/opencl/atomic/atomic_store.h | 24 +
 .../include/clc/opencl/atomic/atomic_sub.h | 2 +-
 .../include/clc/opencl/atomic/atomic_xchg.h | 2 +-
 .../include/clc/opencl/atomic/atomic_xor.h | 2 +-
 .../include/clc/opencl/integer/bit_reverse.h | 25 +
 .../opencl/integer/bitfield_extract_signed.h | 27 +
 .../integer/bitfield_extract_unsigned.h | 27 +
 .../clc/opencl/integer/bitfield_insert.h | 23 +
 .../include/clc/opencl/relational/isfinite.h | 2 +-
 .../include/clc/opencl/relational/isgreater.h | 2 +-
 .../clc/opencl/relational/isgreaterequal.h | 2 +-
 .../include/clc/opencl/relational/isless.h | 2 +-
 .../clc/opencl/relational/islessequal.h | 2 +-
 .../clc/opencl/relational/islessgreater.h | 2 +-
 .../include/clc/opencl/relational/isnormal.h | 2 +-
 .../clc/opencl/relational/isnotequal.h | 2 +-
 .../include/clc/opencl/relational/isordered.h | 2 +-
 .../clc/opencl/relational/isunordered.h | 2 +-
 .../include/clc/opencl/relational/signbit.h | 2 +-
 libclc/opencl/lib/generic/SOURCES | 30 +-
 .../atomic/atomic_compare_exchange_strong.cl | 25 +
 .../atomic/atomic_compare_exchange_weak.cl | 25 +
 .../opencl/lib/generic/atomic/atomic_dec.cl | 14 +-
 .../opencl/lib/generic/atomic/atomic_def.inc | 79 +
 .../lib/generic/atomic/atomic_exchange.cl | 25 +
 .../lib/generic/atomic/atomic_fetch_add.cl | 25 +
 .../lib/generic/atomic/atomic_fetch_and.cl | 22 +
 .../lib/generic/atomic/atomic_fetch_max.cl | 25 +
 .../lib/generic/atomic/atomic_fetch_min.cl | 25 +
 .../lib/generic/atomic/atomic_fetch_or.cl | 22 +
 .../lib/generic/atomic/atomic_fetch_sub.cl | 25 +
 .../lib/generic/atomic/atomic_fetch_xor.cl | 22 +
 .../opencl/lib/generic/atomic/atomic_inc.cl | 14 +-
 .../lib/generic/atomic/atomic_inc_dec.inc | 26 +
 .../opencl/lib/generic/atomic/atomic_load.cl | 26 +
 .../opencl/lib/generic/atomic/atomic_store.cl | 26 +
 .../opencl/lib/generic/integer/bit_reverse.cl | 19 +
 .../generic/integer/bitfield_extract_def.inc | 16 +
 .../integer/bitfield_extract_signed.cl | 20 +
 .../integer/bitfield_extract_unsigned.cl | 20 +
 .../lib/generic/integer/bitfield_insert.cl | 18 +
 .../lib/generic/integer/bitfield_insert.inc | 13 +
 .../lib/generic/relational/binary_def.inc | 10 +-
 .../opencl/lib/generic/relational/isequal.cl | 2 +-
 .../opencl/lib/generic/relational/isfinite.cl | 2 +-
 .../lib/generic/relational/isgreater.cl | 2 +-
 .../lib/generic/relational/isgreaterequal.cl | 2 +-
 libclc/opencl/lib/generic/relational/isinf.cl | 2 +-
 .../opencl/lib/generic/relational/isless.cl | 2 +-
 .../lib/generic/relational/islessequal.cl | 2 +-
 .../lib/generic/relational/islessgreater.cl | 2 +-
 libclc/opencl/lib/generic/relational/isnan.cl | 2 +-
 .../opencl/lib/generic/relational/isnormal.cl | 2 +-
 .../lib/generic/relational/isnotequal.cl | 2 +-
 .../lib/generic/relational/isordered.cl | 2 +-
 .../lib/generic/relational/isunordered.cl | 2 +-
 .../opencl/lib/generic/relational/signbit.cl | 2 +-
 .../lib/generic/relational/unary_def.inc | 10 +-
 libclc/utils/CMakeLists.txt | 24 +
 libcxx/docs/Status/Cxx17Issues.csv | 4 +-
 libcxx/docs/Status/Cxx20Issues.csv | 4 +-
 libcxx/docs/Status/Cxx2cIssues.csv | 12 +-
 libcxx/docs/Status/Cxx2cPapers.csv | 7 +-
 libcxx/docs/index.rst | 2 +-
 libcxx/include/CMakeLists.txt | 2 +-
 libcxx/include/__algorithm/simd_utils.h | 4 +-
 libcxx/include/__config | 19 +-
 libcxx/include/__configuration/compiler.h | 2 +-
 .../__format/range_default_formatter.h | 42 +-
 libcxx/include/__format/range_format.h | 71 +
 libcxx/include/__hash_table | 115 +-
 libcxx/include/__locale_dir/locale_base_api.h | 2 -
 .../__locale_dir/locale_base_api/android.h | 45 -
 .../__memory_resource/polymorphic_allocator.h | 6 +-
 libcxx/include/__new/allocate.h | 71 +-
 libcxx/include/__new/launder.h | 8 +-
 libcxx/include/__tree | 114 +-
 .../__type_traits/is_core_convertible.h | 13 +-
 libcxx/include/barrier | 14 +-
 libcxx/include/ext/hash_map | 4 +-
 libcxx/include/ext/hash_set | 4 +-
 libcxx/include/fstream | 35 +-
 libcxx/include/module.modulemap.in | 2 +-
 libcxx/include/optional | 94 +-
 libcxx/include/string | 68 +-
 libcxx/include/unordered_map | 8 +-
 libcxx/src/barrier.cpp | 7 +-
 .../gnu/hash_multimap/insert.pass.cpp | 35 +
 .../gnu/hash_multiset/insert.pass.cpp | 35 +
 .../always_initialize_iterators.pass.cpp | 53 +
 .../libcxx/localization/lit.local.cfg | 3 +
 .../conversions.string/ctor_move.pass.cpp | 0
 .../algorithms/vectorization.compile.pass.cpp | 3 -
 .../unord/key_value_traits.pass.cpp | 60 -
 .../mem/mem.res/ctor.nullptr.assert.pass.cpp | 30 +
 .../libcxx/mem/mem.res/nonnull.verify.cpp | 17 +
 .../minimal_cxx11_configuration.pass.cpp | 130 -
 .../assert.ctor.pass.cpp | 2 +
 .../assert.ctor.pass.cpp | 2 +
 .../assert.to_local.pass.cpp | 1 +
 .../time.zone.members/assert.to_sys.pass.cpp | 2 +
 .../assert.to_sys_choose.pass.cpp | 2 +
 .../vendor/apple/disable-availability.sh.cpp | 4 -
 .../simd/simd.class/simd_unary.pass.cpp | 3 -
 .../overflow.writefail.pass.cpp | 72 +
 .../pointer.pass.cpp | 4 +-
 .../pointer.volatile.pass.cpp | 4 +-
 .../sized_delete_array.pass.cpp | 1 -
 .../new.delete.single/sized_delete.pass.cpp | 1 -
 .../ptr.launder/launder.types.verify.cpp | 9 +-
 .../algorithm.version.compile.pass.cpp | 1 -
 .../any.version.compile.pass.cpp | 1 -
 .../array.version.compile.pass.cpp | 1 -
 .../atomic.version.compile.pass.cpp | 1 -
 .../barrier.version.compile.pass.cpp | 1 -
 .../bit.version.compile.pass.cpp | 1 -
 .../bitset.version.compile.pass.cpp | 1 -
 .../charconv.version.compile.pass.cpp | 1 -
 .../chrono.version.compile.pass.cpp | 1 -
 .../cmath.version.compile.pass.cpp | 1 -
 .../compare.version.compile.pass.cpp | 1 -
 .../complex.version.compile.pass.cpp | 1 -
 .../concepts.version.compile.pass.cpp | 1 -
 .../coroutine.version.compile.pass.cpp | 1 -
 .../cstddef.version.compile.pass.cpp | 1 -
 .../cstdlib.version.compile.pass.cpp | 1 -
 .../cstring.version.compile.pass.cpp | 1 -
 .../deque.version.compile.pass.cpp | 1 -
 .../exception.version.compile.pass.cpp | 1 -
 .../execution.version.compile.pass.cpp | 1 -
 .../expected.version.compile.pass.cpp | 1 -
 .../filesystem.version.compile.pass.cpp | 1 -
 .../flat_map.version.compile.pass.cpp | 1 -
 .../flat_set.version.compile.pass.cpp | 1 -
 .../format.version.compile.pass.cpp | 1 -
 .../forward_list.version.compile.pass.cpp | 1 -
 .../fstream.version.compile.pass.cpp | 1 -
 .../functional.version.compile.pass.cpp | 1 -
 .../iomanip.version.compile.pass.cpp | 1 -
 .../ios.version.compile.pass.cpp | 1 -
 .../istream.version.compile.pass.cpp | 1 -
 .../iterator.version.compile.pass.cpp | 1 -
 .../latch.version.compile.pass.cpp | 1 -
 .../limits.version.compile.pass.cpp | 1 -
 .../list.version.compile.pass.cpp | 1 -
 .../locale.version.compile.pass.cpp | 1 -
 .../map.version.compile.pass.cpp | 1 -
 .../mdspan.version.compile.pass.cpp | 1 -
 .../memory.version.compile.pass.cpp | 1 -
 .../memory_resource.version.compile.pass.cpp | 1 -
 .../mutex.version.compile.pass.cpp | 1 -
 .../new.version.compile.pass.cpp | 1 -
 .../numbers.version.compile.pass.cpp | 1 -
 .../numeric.version.compile.pass.cpp | 1 -
 .../optional.version.compile.pass.cpp | 1 -
 .../ostream.version.compile.pass.cpp | 1 -
 .../print.version.compile.pass.cpp | 1 -
 .../queue.version.compile.pass.cpp | 1 -
 .../random.version.compile.pass.cpp | 1 -
 .../ranges.version.compile.pass.cpp | 1 -
 .../ratio.version.compile.pass.cpp | 1 -
 .../regex.version.compile.pass.cpp | 1 -
 .../scoped_allocator.version.compile.pass.cpp | 1 -
 .../semaphore.version.compile.pass.cpp | 1 -
 .../set.version.compile.pass.cpp | 1 -
 .../shared_mutex.version.compile.pass.cpp | 1 -
 .../source_location.version.compile.pass.cpp | 1 -
 .../span.version.compile.pass.cpp | 1 -
 .../sstream.version.compile.pass.cpp | 1 -
 .../stack.version.compile.pass.cpp | 1 -
 .../stdatomic.h.version.compile.pass.cpp | 1 -
 .../stop_token.version.compile.pass.cpp | 1 -
 .../string.version.compile.pass.cpp | 1 -
 .../string_view.version.compile.pass.cpp | 1 -
 .../syncstream.version.compile.pass.cpp | 1 -
 .../thread.version.compile.pass.cpp | 1 -
 .../tuple.version.compile.pass.cpp | 1 -
 .../type_traits.version.compile.pass.cpp | 1 -
 .../typeinfo.version.compile.pass.cpp | 1 -
 .../unordered_map.version.compile.pass.cpp | 1 -
 .../unordered_set.version.compile.pass.cpp | 1 -
 .../utility.version.compile.pass.cpp | 1 -
 .../variant.version.compile.pass.cpp | 1 -
 .../vector.version.compile.pass.cpp | 1 -
 .../version.version.compile.pass.cpp | 1 -
 .../c.math/fdelayed-template-parsing.pass.cpp | 0
 .../test/std/numerics/c.math/signbit.pass.cpp | 2 +-
 .../numeric.ops.sat/add_sat.pass.cpp | 2 +-
 .../numeric.ops.sat/div_sat.pass.cpp | 2 +-
 .../numeric.ops.sat/mul_sat.pass.cpp | 2 +-
 .../numeric.ops.sat/saturate_cast.pass.cpp | 2 +-
 .../numeric.ops.sat/sub_sat.pass.cpp | 2 +-
 .../rand.req.urng/valid_int_type.verify.cpp | 0
 .../rand.req.urng/valid_real_type.verify.cpp | 0
 .../range.zip/iterator/increment.pass.cpp | 4 +-
 .../range.utility.conv/to_deduction.pass.cpp | 3 -
 .../shared_ptr_array.pass.cpp | 1 -
 .../meta.unary.comp/is_bounded_array.pass.cpp | 3 -
 ...ue_object_representations.compile.pass.cpp | 3 -
 .../is_implicit_lifetime.pass.cpp | 2 +-
 .../is_implicit_lifetime.verify.cpp | 2 +-
 ...ference_constructs_from_temporary.pass.cpp | 2 +-
 ...reference_converts_from_temporary.pass.cpp | 2 +-
 .../optional.comp_with_t/equal.pass.cpp | 21 +
 .../optional.comp_with_t/greater.pass.cpp | 20 +
 .../greater_equal.pass.cpp | 20 +
 .../optional.comp_with_t/less_equal.pass.cpp | 20 +
 .../optional.comp_with_t/less_than.pass.cpp | 20 +
 .../optional.comp_with_t/not_equal.pass.cpp | 21 +
 .../optional/optional.relops/equal.pass.cpp | 14 +
 .../optional.relops/greater_equal.pass.cpp | 13 +
 .../optional.relops/greater_than.pass.cpp | 13 +
 .../optional.relops/less_equal.pass.cpp | 13 +
 .../optional.relops/less_than.pass.cpp | 13 +
 .../optional.relops/not_equal.pass.cpp | 14 +
 .../bitset.members/to_ullong.pass.cpp | 1 +
 .../mem.poly.allocator.mem/resource.pass.cpp | 27 +-
 ...ct_on_container_copy_construction.pass.cpp | 28 +-
 libcxx/utils/ci/Dockerfile | 21 +-
 libcxx/utils/ci/docker-compose.yml | 6 +-
 .../generate_feature_test_macro_components.py | 1 -
 lld/ELF/Arch/Hexagon.cpp | 119 +
 lld/ELF/Config.h | 1 +
 lld/ELF/Driver.cpp | 4 +
 lld/ELF/Relocations.cpp | 5 +-
 lld/ELF/Target.h | 1 +
 lld/MachO/Arch/ARM64.cpp | 509 -
 lld/MachO/CMakeLists.txt | 1 +
 lld/MachO/LinkerOptimizationHints.cpp | 523 +
 lld/MachO/LinkerOptimizationHints.h | 17 +
 lld/MachO/Target.h | 2 -
 lld/MachO/Writer.cpp | 6 +-
 lld/docs/ReleaseNotes.rst | 4 +
 lld/test/ELF/hexagon-attributes.s | 150 +
 lld/test/MachO/loh-arm64-32.s | 64 +
 lldb/docs/resources/build.rst | 2 +
 lldb/docs/resources/qemu-testing.rst | 47 +
 lldb/docs/use/mcp.md | 64 +-
 lldb/examples/python/filter_disasm.py | 98 +
 lldb/examples/synthetic/gnu_libstdcpp.py | 32 -
 lldb/include/lldb/Core/Opcode.h | 39 +-
 lldb/include/lldb/Core/PluginManager.h | 3 +
 .../lldb/Interpreter/CommandCompletions.h | 4 +
 lldb/include/lldb/lldb-enumerations.h | 3 +-
 .../Python/lldbsuite/test/lldbtest.py | 15 +-
 .../Python/lldbsuite/test/make/Makefile.rules | 4 -
 lldb/source/Commands/CommandCompletions.cpp | 8 +
 .../Commands/CommandObjectDWIMPrint.cpp | 2 +
 .../Commands/CommandObjectExpression.cpp | 3 +
 lldb/source/Commands/CommandObjectPlugin.cpp | 24 +
 lldb/source/Core/Disassembler.cpp | 6 +-
 lldb/source/Core/Opcode.cpp | 27 +-
 lldb/source/Core/PluginManager.cpp | 32 +
 .../Host/common/NativeProcessProtocol.cpp | 18 +-
 .../source/Interpreter/CommandInterpreter.cpp | 8 +-
 .../Disassembler/LLVMC/DisassemblerLLVMC.cpp | 60 +-
 .../Plugins/Language/CPlusPlus/CMakeLists.txt | 4 +-
 .../Language/CPlusPlus/CPlusPlusLanguage.cpp | 121 +-
 .../{LibCxxList.cpp => GenericList.cpp} | 294 +-
 .../Plugins/Language/CPlusPlus/LibStdcpp.cpp | 46 +
 .../Plugins/Language/CPlusPlus/LibStdcpp.h | 4 +
 .../Plugins/Language/CPlusPlus/MsvcStl.h | 20 +
 .../Language/CPlusPlus/MsvcStlTuple.cpp | 105 +
 .../Language/CPlusPlus/MsvcStlVector.cpp | 305 +
 .../Process/minidump/MinidumpParser.cpp | 34 +-
 .../Plugins/Process/minidump/MinidumpParser.h | 3 +-
 .../Process/minidump/ProcessMinidump.cpp | 9 +-
 .../TypeSystem/Clang/TypeSystemClang.cpp | 14 -
 lldb/source/Target/TargetProperties.td | 6 +-
 lldb/source/Utility/ArchSpec.cpp | 4 +-
 lldb/source/ValueObject/DILEval.cpp | 2 +-
 .../TestFrameVarDILArraySubscript.py | 1 -
 .../QualifiedId/TestFrameVarDILQualifiedId.py | 15 +
 .../frame/var-dil/basics/QualifiedId/main.cpp | 21 +-
 lldb/test/API/commands/plugin/TestPlugin.py | 46 +
 .../data-formatter-advanced/main.cpp | 1 +
 .../TestDataFormatterDisabling.py | 4 -
 .../TestDataFormatterGenericForwardList.py | 33 +-
 .../list/TestDataFormatterGenericList.py | 35 +-
 .../loop/TestDataFormatterGenericListLoop.py | 18 +-
 .../generic/list/loop/main.cpp | 5 -
 .../tuple/TestDataFormatterStdTuple.py | 6 +
 .../data-formatter-stl/generic/tuple/main.cpp | 1 +
 .../vbool/TestDataFormatterStdVBool.py | 24 +-
 .../data-formatter-stl/generic/vbool/main.cpp | 39 +-
 .../vector/TestDataFormatterStdVector.py | 11 +
 .../API/lang/cpp/template/TestTemplateArgs.py | 1 -
 lldb/test/API/lit.cfg.py | 10 +-
 lldb/test/API/lit.site.cfg.py.in | 10 +-
 lldb/test/API/python_api/type/main.cpp | 7 +-
 .../lldb-dap/evaluate/TestDAP_evaluate.py | 6 +-
 lldb/test/Shell/Commands/Inputs/dis_filt.py | 8 +
 .../command-disassemble-riscv32-bytes.s | 37 +
 .../Commands/command-disassemble-x86-bytes.s | 28 +
 .../Shell/Minidump/missing-memory-region.yaml | 42 +
 .../Settings/TestChildCountTruncation.test | 68 +
 .../Settings/TestChildDepthTruncation.test | 84 +
 .../SymbolFile/DWARF/TestDedupWarnings.test | 2 +-
 .../NativePDB/Inputs/class_layout.lldbinit | 1 +
 lldb/test/Shell/helper/toolchain.py | 5 +
 lldb/test/Shell/lit.site.cfg.py.in | 9 +
 lldb/tools/lldb-dap/DAP.cpp | 2 +-
 .../RISCV/TestMCDisasmInstanceRISCV.cpp | 62 +-
 .../Host/NativeProcessProtocolTest.cpp | 93 +-
 .../Process/minidump/MinidumpParserTest.cpp | 23 +-
 llvm/CMakeLists.txt | 11 +
 llvm/clang/test/Modules/implicit-opt-level.c | 15 +
 llvm/docs/CommandGuide/index.rst | 1 +
 llvm/docs/CommandGuide/llvm-ir2vec.rst | 170 +
 llvm/docs/GettingInvolved.rst | 5 -
 llvm/docs/HowToSubmitABug.rst | 13 +-
 llvm/docs/HowToUpdateDebugInfo.rst | 10 +-
 llvm/docs/KeyInstructionsDebugInfo.md | 160 +
 llvm/docs/LangRef.rst | 48 +-
 llvm/docs/MLGO.rst | 12 +-
 llvm/docs/RISCVUsage.rst | 3 +
 llvm/docs/ReleaseNotes.md | 17 +
 llvm/docs/Security.rst | 76 +-
 llvm/docs/UserGuides.rst | 5 +
 llvm/include/llvm/ADT/EquivalenceClasses.h | 8 +-
 llvm/include/llvm/ADT/StringTable.h | 12 +-
 llvm/include/llvm/Analysis/IR2Vec.h | 30 +-
 llvm/include/llvm/Analysis/VectorUtils.h | 5 +
 llvm/include/llvm/BinaryFormat/ELF.h | 3 +-
 .../llvm/BinaryFormat/MinidumpConstants.def | 1 -
 llvm/include/llvm/BinaryFormat/SFrame.h | 165 +
 llvm/include/llvm/CodeGen/AtomicExpandUtils.h | 3 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h | 3 +-
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 23 +-
 .../llvm/CodeGen/MachineFunctionAnalysis.h | 5 +
 llvm/include/llvm/CodeGen/PostRAMachineSink.h | 3 +-
 .../llvm/CodeGen/ProcessImplicitDefs.h | 27 +
 llvm/include/llvm/CodeGen/TargetLowering.h | 37 +-
 llvm/include/llvm/CodeGen/VirtRegMap.h | 6 +-
 .../Frontend/OpenMP/ConstructDecompositionT.h | 18 +-
 llvm/include/llvm/Frontend/OpenMP/OMP.h | 20 +
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 4 +
 llvm/include/llvm/IR/DebugInfo.h | 5 +-
 llvm/include/llvm/IR/DebugInfoMetadata.h | 3 -
 llvm/include/llvm/IR/Instruction.h | 22 -
 llvm/include/llvm/IR/IntrinsicInst.h | 7 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 172 +-
 llvm/include/llvm/IR/IntrinsicsSPIRV.td | 7 +-
 llvm/include/llvm/IR/OptBisect.h | 35 +-
 llvm/include/llvm/IR/PassManager.h | 16 +
 llvm/include/llvm/IR/PatternMatch.h | 2 +-
 llvm/include/llvm/IR/RuntimeLibcalls.h | 17 +-
 llvm/include/llvm/IR/RuntimeLibcalls.td | 129 +-
 llvm/include/llvm/InitializePasses.h | 2 +-
 llvm/include/llvm/MC/MCAsmBackend.h | 19 +-
 llvm/include/llvm/MC/MCAssembler.h | 46 +-
 llvm/include/llvm/MC/MCCodeView.h | 3 +-
 llvm/include/llvm/MC/MCContext.h | 3 +-
 llvm/include/llvm/MC/MCELFStreamer.h | 18 +-
 llvm/include/llvm/MC/MCFixup.h | 1 -
 llvm/include/llvm/MC/MCObjectStreamer.h | 25 +-
 llvm/include/llvm/MC/MCSection.h | 335 +-
 llvm/include/llvm/MC/MCStreamer.h | 23 +-
 llvm/include/llvm/MC/MCWasmStreamer.h | 2 +-
 llvm/include/llvm/Object/ELFObjectFile.h | 8 +-
 llvm/include/llvm/Pass.h | 4 +
 llvm/include/llvm/Passes/CodeGenPassBuilder.h | 13 +-
 .../llvm/Passes/MachinePassRegistry.def | 37 +-
 .../llvm/TableGen/StringToOffsetTable.h | 7 +-
 .../include/llvm/Target/GlobalISel/Combine.td | 20 +-
 .../llvm/Target/TargetLoweringObjectFile.h | 3 +
 llvm/include/llvm/TargetParser/Host.h | 2 +-
 llvm/include/llvm/TargetParser/Triple.h | 128 +-
 llvm/include/llvm/Transforms/Utils/Local.h | 34 +-
 .../Utils/LockstepReverseIterator.h | 6 +-
 .../Transforms/Utils/MemoryTaggingSupport.h | 3 -
 .../Utils/ScalarEvolutionExpander.h | 1 +
 llvm/lib/Analysis/CallGraph.cpp | 1 -
 llvm/lib/Analysis/DependenceAnalysis.cpp | 14 +-
 llvm/lib/Analysis/HashRecognize.cpp | 42 +-
 llvm/lib/Analysis/IR2Vec.cpp | 71 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 13 +-
 llvm/lib/Analysis/LoopInfo.cpp | 1 -
 .../lib/Analysis/MemoryDependenceAnalysis.cpp | 2 +-
 llvm/lib/Analysis/TargetLibraryInfo.cpp | 2 +-
 llvm/lib/Analysis/ValueTracking.cpp | 12 +
 llvm/lib/Analysis/VectorUtils.cpp | 9 +
 llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp | 1 -
 llvm/lib/CodeGen/CodeGen.cpp | 2 +-
 llvm/lib/CodeGen/CodeGenPrepare.cpp | 6 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 45 +-
 llvm/lib/CodeGen/GlobalMergeFunctions.cpp | 4 +
 llvm/lib/CodeGen/InterleavedAccessPass.cpp | 88 +-
 llvm/lib/CodeGen/MachineFunctionAnalysis.cpp | 6 +
 llvm/lib/CodeGen/MachineSink.cpp | 3 +-
 llvm/lib/CodeGen/ProcessImplicitDefs.cpp | 57 +-
 llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp | 1 -
 llvm/lib/CodeGen/SafeStack.cpp | 20 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 145 +-
 .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 2 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 19 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp | 49 +-
 llvm/lib/CodeGen/StackProtector.cpp | 30 +-
 llvm/lib/CodeGen/TargetInstrInfo.cpp | 16 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp | 17 +-
 .../CodeGen/TargetLoweringObjectFileImpl.cpp | 23 +-
 llvm/lib/DWARFCFIChecker/Registers.h | 3 +-
 llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp | 10 +-
 .../DWARF/LowLevel/DWARFCFIProgram.cpp | 4 -
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 105 +-
 llvm/lib/IR/DebugInfo.cpp | 25 +-
 llvm/lib/IR/DebugInfoMetadata.cpp | 9 -
 llvm/lib/IR/Instruction.cpp | 19 -
 llvm/lib/IR/Instructions.cpp | 2 +-
 llvm/lib/IR/Intrinsics.cpp | 24 +-
 llvm/lib/IR/LegacyPassManager.cpp | 1 -
 llvm/lib/IR/OptBisect.cpp | 42 +-
 llvm/lib/IR/Pass.cpp | 26 +-
 llvm/lib/IR/RuntimeLibcalls.cpp | 153 +-
 llvm/lib/IR/Verifier.cpp | 212 +-
 llvm/lib/MC/MCAsmBackend.cpp | 9 +-
 llvm/lib/MC/MCAsmStreamer.cpp | 36 +-
 llvm/lib/MC/MCAssembler.cpp | 272 +-
 llvm/lib/MC/MCCodeView.cpp | 2 +-
 llvm/lib/MC/MCContext.cpp | 18 +-
 llvm/lib/MC/MCELFStreamer.cpp | 173 +-
 llvm/lib/MC/MCExpr.cpp | 10 +-
 llvm/lib/MC/MCFragment.cpp | 94 +-
 llvm/lib/MC/MCMachOStreamer.cpp | 7 +-
 llvm/lib/MC/MCObjectStreamer.cpp | 272 +-
 llvm/lib/MC/MCParser/AsmParser.cpp | 77 +-
 llvm/lib/MC/MCSection.cpp | 59 +-
 llvm/lib/MC/MCStreamer.cpp | 3 -
 llvm/lib/MC/MCSymbol.cpp | 2 +-
 llvm/lib/MC/MCWasmStreamer.cpp | 4 +-
 llvm/lib/MC/MCWin64EH.cpp | 2 +-
 llvm/lib/MC/MCWinCOFFStreamer.cpp | 10 +-
 llvm/lib/MC/MCXCOFFStreamer.cpp | 2 +-
 llvm/lib/MC/MachObjectWriter.cpp | 2 +-
 llvm/lib/MC/WasmObjectWriter.cpp | 10 +-
 llvm/lib/MC/WinCOFFObjectWriter.cpp | 4 +-
 llvm/lib/Object/DXContainer.cpp | 1 -
 llvm/lib/Object/ELF.cpp | 1 +
 llvm/lib/Object/ELFObjectFile.cpp | 4 +
 llvm/lib/Object/RelocationResolver.cpp | 6 +-
 llvm/lib/ObjectYAML/ELFYAML.cpp | 2 +
 llvm/lib/Passes/PassBuilder.cpp | 3 +
 llvm/lib/Passes/PassRegistry.def | 4 +
 llvm/lib/Passes/StandardInstrumentations.cpp | 10 +-
 llvm/lib/Support/BLAKE3/blake3_dispatch.c | 2 +-
 llvm/lib/Support/BLAKE3/blake3_impl.h | 2 +-
 llvm/lib/Support/BLAKE3/blake3_neon.c | 9 +-
 llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h | 2 +
 llvm/lib/Support/rpmalloc/rpmalloc.c | 18 +-
 llvm/lib/TableGen/StringToOffsetTable.cpp | 11 +-
 llvm/lib/TableGen/TGLexer.cpp | 267 +-
 .../AArch64/AArch64ExpandPseudoInsts.cpp | 24 +-
 llvm/lib/Target/AArch64/AArch64Features.td | 7 +
 .../Target/AArch64/AArch64ISelLowering.cpp | 157 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h | 9 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 324 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.h | 4 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 +
 .../AArch64/AArch64LoadStoreOptimizer.cpp | 2 +-
 llvm/lib/Target/AArch64/AArch64Processors.td | 18 +-
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 18 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 1 +
 .../AArch64/AArch64TargetTransformInfo.cpp | 91 +-
 .../AArch64/AArch64TargetTransformInfo.h | 8 +-
 .../MCTargetDesc/AArch64AsmBackend.cpp | 8 +-
 .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 6 +-
 .../MCTargetDesc/AArch64ELFStreamer.cpp | 11 +-
 .../MCTargetDesc/AArch64MCTargetDesc.cpp | 2 +-
 .../MCTargetDesc/AArch64MachObjectWriter.cpp | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPU.h | 3 +
 llvm/lib/Target/AMDGPU/AMDGPU.td | 23 +
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2 -
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 294 -
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 12 +
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 110 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 3 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 84 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 +
 .../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 24 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp | 93 +
 .../Target/AMDGPU/AMDGPUInstructionSelector.h | 9 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 60 +-
 llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 5 +
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
 .../Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 108 +
 .../Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h | 23 +
 .../AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 2 +-
 .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 1 -
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 38 +
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 234 +-
 .../Target/AMDGPU/AMDGPUSearchableTables.td | 4 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 19 +
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 3 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 42 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/FLATInstructions.td | 504 +-
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 46 +
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1 +
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 8 +
.../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 2 +-
.../MCTargetDesc/AMDGPUELFObjectWriter.cpp | 4 +-
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 +
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 +
.../AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 1 +
.../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 18 +-
.../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 1 +
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 20 +-
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 263 +-
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 508 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 256 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 29 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 12 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 55 +-
.../Target/AMDGPU/SILoadStoreOptimizer.cpp | 3 +-
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 8 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 6 +-
llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 1 +
llvm/lib/Target/AMDGPU/SISchedule.td | 21 +
.../Target/AMDGPU/SIShrinkInstructions.cpp | 18 +-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 10 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 7 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 9 +
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 23 +
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 19 +-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 29 +-
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 392 +-
llvm/lib/Target/AMDGPU/VOPInstructions.td | 15 +-
llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 6 -
llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 4 +-
llvm/lib/Target/ARM/ARMFastISel.cpp | 8 +-
llvm/lib/Target/ARM/ARMFeatures.td | 6 -
llvm/lib/Target/ARM/ARMFrameLowering.cpp | 4 +-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 17 +-
llvm/lib/Target/ARM/ARMISelLowering.h | 2 +-
llvm/lib/Target/ARM/ARMInstrInfo.td | 20 +-
llvm/lib/Target/ARM/ARMPredicates.td | 4 -
llvm/lib/Target/ARM/ARMSubtarget.cpp | 9 +-
llvm/lib/Target/ARM/ARMSubtarget.h | 1 -
llvm/lib/Target/ARM/ARMTargetMachine.cpp | 5 +-
llvm/lib/Target/ARM/ARMTargetObjectFile.cpp | 1 -
llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 6 +-
.../Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 11 +-
.../Target/ARM/MCTargetDesc/ARMAsmBackend.h | 3 +-
.../ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 4 +-
.../ARM/MCTargetDesc/ARMELFStreamer.cpp | 19 +-
.../ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 6 -
.../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 4 +-
.../Target/BPF/MCTargetDesc/BPFAsmBackend.cpp | 2 +-
.../CSKY/MCTargetDesc/CSKYAsmBackend.cpp | 11 +-
.../Target/CSKY/MCTargetDesc/CSKYAsmBackend.h | 3 +-
.../CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp | 2 +-
.../Target/DirectX/DXILDataScalarization.cpp | 9 +-
llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 308 +-
llvm/lib/Target/DirectX/DXILLegalizePass.cpp | 74 +-
.../Target/DirectX/DirectXTargetMachine.cpp | 2 +-
.../Hexagon/HexagonConstPropagation.cpp | 2 +-
.../Target/Hexagon/HexagonISelDAGToDAG.cpp | 9 +
.../Target/Hexagon/HexagonISelLowering.cpp | 14 +-
llvm/lib/Target/Hexagon/HexagonISelLowering.h | 3 +
llvm/lib/Target/Hexagon/HexagonPatterns.td | 5 +
.../MCTargetDesc/HexagonAsmBackend.cpp | 19 +-
.../MCTargetDesc/HexagonELFObjectWriter.cpp | 2 +-
llvm/lib/Target/Lanai/LanaiFrameLowering.cpp | 12 +-
.../LoongArch/LoongArchFloat32InstrInfo.td | 2 +-
.../LoongArch/LoongArchFrameLowering.cpp | 7 +-
.../LoongArch/LoongArchISelLowering.cpp | 10 +-
.../LoongArch/LoongArchLASXInstrInfo.td | 93 +-
.../Target/LoongArch/LoongArchLSXInstrInfo.td | 58 +-
.../MCTargetDesc/LoongArchAsmBackend.cpp | 43 +-
.../MCTargetDesc/LoongArchAsmBackend.h | 8 +-
.../MCTargetDesc/LoongArchELFObjectWriter.cpp | 2 +-
.../MCTargetDesc/MSP430ELFObjectWriter.cpp | 2 +-
.../Target/Mips/AsmParser/MipsAsmParser.cpp | 2 +-
.../Target/Mips/MCTargetDesc/CMakeLists.txt | 1 -
.../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 2 +-
.../lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h | 31 -
.../Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 9 +-
.../Mips/MCTargetDesc/MipsNaClELFStreamer.cpp | 274 -
.../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 12 +-
llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 33 +-
llvm/lib/Target/Mips/MipsAsmPrinter.h | 2 -
llvm/lib/Target/Mips/MipsBranchExpansion.cpp | 25 +-
llvm/lib/Target/Mips/MipsCallingConv.td | 11 +-
llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp | 13 -
llvm/lib/Target/Mips/MipsInstrFPU.td | 18 +-
llvm/lib/Target/Mips/MipsInstrInfo.td | 1 -
llvm/lib/Target/Mips/MipsRegisterInfo.cpp | 7 -
llvm/lib/Target/Mips/MipsSubtarget.h | 1 -
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 24 +-
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 4 +-
llvm/lib/Target/NVPTX/NVPTX.h | 4 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 51 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 5 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 260 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 47 +-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 196 +-
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 2 +
.../MCTargetDesc/PPCELFObjectWriter.cpp | 7 +-
.../PowerPC/MCTargetDesc/PPCELFStreamer.cpp | 9 +-
.../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 12 +
.../RISCV/Disassembler/RISCVDisassembler.cpp | 3 +-
.../RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 103 +-
.../RISCV/MCTargetDesc/RISCVAsmBackend.h | 11 +-
.../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 12 +-
.../MCTargetDesc/RISCVELFObjectWriter.cpp | 5 +-
.../RISCV/MCTargetDesc/RISCVFixupKinds.h | 4 +
.../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 2 +
.../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 42 +
llvm/lib/Target/RISCV/RISCVFeatures.td | 11 +
llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 32 +-
llvm/lib/Target/RISCV/RISCVFrameLowering.h | 3 +-
llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 175 +-
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 81 +-
llvm/lib/Target/RISCV/RISCVISelLowering.h | 15 +-
llvm/lib/Target/RISCV/RISCVInstrFormats.td | 4 +-
llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 4 +-
llvm/lib/Target/RISCV/RISCVInstrInfo.td | 54 +
llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 12 +-
llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 35 +-
llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 148 +-
llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td | 1 +
.../Target/RISCV/RISCVInterleavedAccess.cpp | 464 +-
.../lib/Target/RISCV/RISCVSchedSpacemitX60.td | 532 +-
llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 26 +-
.../RISCV/RISCVVectorMaskDAGMutation.cpp | 27 +-
.../SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 1 -
llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 27 +-
.../Target/SPIRV/SPIRVInstructionSelector.cpp | 2 +
.../MCTargetDesc/SparcELFObjectWriter.cpp | 4 +-
.../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 1 -
.../Target/Sparc/SparcTargetObjectFile.cpp | 1 -
llvm/lib/Target/TargetLoweringObjectFile.cpp | 29 +
.../Target/VE/MCTargetDesc/VEAsmBackend.cpp | 2 +-
.../VE/MCTargetDesc/VEELFObjectWriter.cpp | 4 +-
.../WebAssemblyLowerEmscriptenEHSjLj.cpp | 1 -
.../WebAssembly/WebAssemblyTargetMachine.cpp | 1 -
.../Target/X86/MCTargetDesc/X86AsmBackend.cpp | 67 +-
.../X86/MCTargetDesc/X86MachObjectWriter.cpp | 4 +-
llvm/lib/Target/X86/X86CallingConv.cpp | 31 +
llvm/lib/Target/X86/X86CallingConv.td | 5 +
llvm/lib/Target/X86/X86ExpandPseudo.cpp | 3 +-
llvm/lib/Target/X86/X86FrameLowering.cpp | 6 +-
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 4 -
llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +-
llvm/lib/Target/X86/X86ISelLowering.h | 2 +-
llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 15 +-
llvm/lib/Target/X86/X86InstrPredicates.td | 2 -
llvm/lib/Target/X86/X86InterleavedAccess.cpp | 7 +-
llvm/lib/Target/X86/X86Subtarget.cpp | 5 +-
llvm/lib/Target/X86/X86Subtarget.h | 11 +-
llvm/lib/Target/X86/X86TargetMachine.cpp | 6 +-
llvm/lib/Target/X86/X86WinEHState.cpp | 4 +-
.../Xtensa/MCTargetDesc/XtensaAsmBackend.cpp | 2 +-
llvm/lib/TargetParser/ARMTargetParser.cpp | 1 -
llvm/lib/TargetParser/Host.cpp | 12 +-
llvm/lib/TargetParser/TargetParser.cpp | 1 +
llvm/lib/TargetParser/Triple.cpp | 326 +-
llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 7 +
llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 5 +-
llvm/lib/Transforms/IPO/Attributor.cpp | 2 +-
.../Transforms/IPO/AttributorAttributes.cpp | 7 +-
llvm/lib/Transforms/IPO/GlobalOpt.cpp | 4 +-
llvm/lib/Transforms/IPO/IROutliner.cpp | 7 +-
llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 4 +-
.../InstCombine/InstCombineCalls.cpp | 127 +-
.../InstCombine/InstCombineCompares.cpp | 11 +-
.../InstCombine/InstCombineInternal.h | 16 +-
.../InstCombineLoadStoreAlloca.cpp | 28 +-
.../InstCombine/InstCombineSelect.cpp | 39 +-
.../InstCombine/InstructionCombining.cpp | 89 +-
.../Instrumentation/AddressSanitizer.cpp | 6 +-
.../Instrumentation/GCOVProfiling.cpp | 1 -
.../Instrumentation/HWAddressSanitizer.cpp | 4 +-
.../Instrumentation/LowerAllowCheckPass.cpp | 19 +-
.../Instrumentation/MemorySanitizer.cpp | 13 +-
.../Instrumentation/ValueProfilePlugins.inc | 4 +-
.../Scalar/ConstraintElimination.cpp | 1 +
.../Scalar/DeadStoreElimination.cpp | 1 +
llvm/lib/Transforms/Scalar/GVN.cpp | 15 +-
llvm/lib/Transforms/Scalar/GVNSink.cpp | 2 +-
.../lib/Transforms/Scalar/LoopInterchange.cpp | 91 +-
.../Transforms/Scalar/LoopStrengthReduce.cpp | 2 +-
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 1 -
.../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 4 +-
llvm/lib/Transforms/Scalar/SROA.cpp | 107 +-
.../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 3 +-
.../Scalar/SpeculativeExecution.cpp | 1 -
llvm/lib/Transforms/Utils/CloneFunction.cpp | 30 +-
llvm/lib/Transforms/Utils/CodeExtractor.cpp | 8 +-
llvm/lib/Transforms/Utils/InlineFunction.cpp | 3 +-
llvm/lib/Transforms/Utils/Local.cpp | 392 +-
.../Transforms/Utils/MemoryTaggingSupport.cpp | 26 +-
.../Utils/PromoteMemoryToRegister.cpp | 157 +-
.../Utils/ScalarEvolutionExpander.cpp | 25 +-
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 30 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 76 +-
.../Transforms/Vectorize/SLPVectorizer.cpp | 53 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 52 +-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 1 -
.../Vectorize/VPlanConstruction.cpp | 24 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 +
.../Transforms/Vectorize/VPlanTransforms.h | 4 +
.../CostModel/AArch64/arith-overflow.ll | 96 +-
llvm/test/Analysis/CostModel/AArch64/cmp.ll | 8 +-
llvm/test/Analysis/CostModel/AArch64/fcmp.ll | 256 +-
llvm/test/Analysis/CostModel/AArch64/fshl.ll | 6 +-
llvm/test/Analysis/CostModel/AArch64/fshr.ll | 6 +-
.../Analysis/CostModel/AArch64/reduce-and.ll | 20 +-
.../Analysis/CostModel/AArch64/reduce-or.ll | 20 +-
.../Analysis/CostModel/AArch64/reduce-xor.ll | 20 +-
.../CostModel/AArch64/shuffle-other.ll | 22 +-
.../CostModel/AArch64/shuffle-select.ll | 6 +-
.../Analysis/CostModel/AArch64/sve-cmpsel.ll | 12 +-
.../Analysis/CostModel/AArch64/sve-div.ll | 8 +
.../Analysis/CostModel/AArch64/sve-fcmp.ll | 138 +-
.../CostModel/AArch64/sve-intrinsics.ll | 52 +-
.../CostModel/AArch64/vector-select.ll | 68 +-
.../Analysis/CostModel/AMDGPU/reduce-and.ll | 4 +-
.../Analysis/CostModel/AMDGPU/reduce-or.ll | 4 +-
.../Analysis/CostModel/ARM/arith-overflow.ll | 180 +-
.../test/Analysis/CostModel/ARM/arith-ssat.ll | 100 +-
.../test/Analysis/CostModel/ARM/arith-usat.ll | 100 +-
llvm/test/Analysis/CostModel/ARM/cmps.ll | 46 +-
.../CostModel/ARM/intrinsic-cost-kinds.ll | 6 +-
llvm/test/Analysis/CostModel/ARM/mve-abs.ll | 12 +-
.../test/Analysis/CostModel/ARM/mve-minmax.ll | 48 +-
llvm/test/Analysis/CostModel/ARM/select.ll | 104 +-
.../DependenceAnalysis/DifferentOffsets.ll | 6 +-
.../DependenceAnalysis/MIVCheckConst.ll | 3 +
.../HashRecognize/cyclic-redundancy-check.ll | 58 +-
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 232 +
llvm/test/CMakeLists.txt | 1 +
.../AArch64/aarch64-combine-gather-lanes.mir | 364 +
.../CodeGen/AArch64/arm64-vector-insertion.ll | 71 +-
.../AArch64/cgdata-no-merge-unnamed.ll | 32 +
.../complex-deinterleaving-uniform-cases.ll | 134 +-
llvm/test/CodeGen/AArch64/concat-vector.ll | 5 +-
.../AArch64/fp-maximumnum-minimumnum.ll | 50 +-
llvm/test/CodeGen/AArch64/fsh.ll | 113 +-
llvm/test/CodeGen/AArch64/llvm.frexp.ll | 14 +-
llvm/test/CodeGen/AArch64/load-combine.ll | 179 +
.../machine-outliner-safe-range-in-middle.mir | 42 +-
llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 345 +-
llvm/test/CodeGen/AArch64/nontemporal.ll | 48 +-
llvm/test/CodeGen/AArch64/pr148949.ll | 22 +
llvm/test/CodeGen/AArch64/rem-by-const.ll | 819 +-
.../CodeGen/AArch64/saturating-vec-smull.ll | 223 +
llvm/test/CodeGen/AArch64/spill-fold.mir | 19 +
llvm/test/CodeGen/AArch64/spillfill-sve.mir | 49 +-
.../AArch64/sve-pseudos-expand-undef.mir | 101 +-
.../AArch64/unsupported-object-format-err.ll | 14 +
llvm/test/CodeGen/AArch64/vecreduce-fadd.ll | 3 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll | 8 +-
.../AMDGPU/GlobalISel/inst-select-bswap.mir | 19 -
.../AMDGPU/GlobalISel/inst-select-fshr.mir | 22 +-
.../inst-select-load-atomic-flat.mir | 30 +-
.../inst-select-load-atomic-global.mir | 24 +-
.../inst-select-store-atomic-flat.mir | 22 +-
.../AMDGPU/GlobalISel/known-bits-sbfe.mir | 253 +
llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll | 8 +-
.../CodeGen/AMDGPU/GlobalISel/srem.i32.ll | 158 +-
llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir | 83 +
llvm/test/CodeGen/AMDGPU/agpr-remat.ll | 18 +-
.../amdgpu-codegenprepare-i16-to-i32.ll | 2853 ------
.../AMDGPU/amdgpu-codegenprepare-idiv.ll | 20 +-
.../AMDGPU/amdgpu-prepare-agpr-alloc.mir | 95 +
llvm/test/CodeGen/AMDGPU/bf16-math.ll | 51 +
.../branch-folding-implicit-def-subreg.ll | 52 +-
.../AMDGPU/branch-relaxation-gfx1250.ll | 630 ++
.../AMDGPU/buffer-fat-pointers-memcpy.ll | 105 +-
...ug-multi-operands-to-update-after-fold.mir | 15 +
.../test/CodeGen/AMDGPU/code-size-estimate.ll | 314 +
llvm/test/CodeGen/AMDGPU/div_i128.ll | 18 +-
llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 2223 +++++
llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll | 1118 +++
llvm/test/CodeGen/AMDGPU/fma.f64.ll | 45 +-
.../test/CodeGen/AMDGPU/fold-commute-sgpr.mir | 24 +
llvm/test/CodeGen/AMDGPU/freeze.ll | 240 +-
llvm/test/CodeGen/AMDGPU/global-address.ll | 138 +
.../CodeGen/AMDGPU/inflate-av-remat-imm.mir | 38 +-
...-reg-class-snippet-copy-use-after-free.mir | 2 +-
.../AMDGPU/insert-delay-alu-wmma-xdl.mir | 67 +
llvm/test/CodeGen/AMDGPU/literal64.ll | 335 +
llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 +
.../CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll | 33 +
.../CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll | 33 +
.../CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll | 33 +
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 4 +-
.../CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll | 36 +
.../CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll | 95 +
.../CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll | 33 +
.../CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll | 33 +
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll | 170 +-
.../AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll | 840 ++
.../llvm.amdgcn.wmma.imm.gfx1250.w32.ll | 2238 +++++
.../llvm.amdgcn.wmma.imod.gfx1250.w32.ll | 1993 ++++
.../llvm.amdgcn.wmma.index.gfx1250.w32.ll | 690 ++
llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll | 1013 ++
llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll | 240 +
llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll | 1 +
llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 3 +-
llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll | 108 +
.../CodeGen/AMDGPU/merge-tbuffer-gfx10.mir | 1526 +++
.../CodeGen/AMDGPU/merge-tbuffer-gfx11.mir | 1527 +++
.../CodeGen/AMDGPU/merge-tbuffer-gfx9.mir | 1553 +++
llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir | 8708 -----------------
.../AMDGPU/move-load-addr-to-valu-flat.mir | 357 +
.../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 10 +-
.../CodeGen/AMDGPU/no-fold-accvgpr-mov.mir | 28 +-
.../CodeGen/AMDGPU/no-fold-accvgpr-read.mir | 26 +-
llvm/test/CodeGen/AMDGPU/opencl-printf.ll | 270 +-
...al-regcopy-and-spill-missed-at-regalloc.ll | 20 +-
.../test/CodeGen/AMDGPU/peephole-fold-imm.mir | 141 +-
.../CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir | 56 +
.../AMDGPU/ptradd-sdag-optimizations.ll | 196 +-
.../AMDGPU/regalloc-undef-copy-fold.mir | 79 +
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 18 +-
.../AMDGPU/select-cmp-shared-constant-fp.ll | 1429 +++
.../AMDGPU/select-cmp-shared-constant-int.ll | 955 ++
llvm/test/CodeGen/AMDGPU/select-undef.ll | 20 +
.../test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll | 100 +
.../CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir | 31 +
llvm/test/CodeGen/AMDGPU/shrink-fma-f64.mir | 62 +
llvm/test/CodeGen/AMDGPU/spill-agpr.mir | 120 +-
.../CodeGen/AMDGPU/spill-vector-superclass.ll | 6 +-
llvm/test/CodeGen/AMDGPU/srem.ll | 4 +-
llvm/test/CodeGen/AMDGPU/srem64.ll | 88 +-
...-last-chance-recoloring-alloc-succeeds.mir | 22 +-
.../AMDGPU/trans-coexecution-hazard.mir | 132 +
llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir | 30 +-
.../test/CodeGen/AMDGPU/vector-reduce-smax.ll | 347 +-
.../test/CodeGen/AMDGPU/vector-reduce-smin.ll | 347 +-
.../test/CodeGen/AMDGPU/vector-reduce-umax.ll | 347 +-
.../test/CodeGen/AMDGPU/vector-reduce-umin.ll | 347 +-
llvm/test/CodeGen/ARM/fast-isel-align.ll | 3 -
llvm/test/CodeGen/ARM/llvm.frexp.ll | 53 +
.../CodeGen/ARM/stack-protector-target.ll | 68 +
llvm/test/CodeGen/ARM/struct_byval.ll | 14 -
llvm/test/CodeGen/ARM/trap.ll | 20 -
.../ARM/varargs-spill-stack-align-nacl.ll | 31 -
llvm/test/CodeGen/DirectX/flatten-array.ll | 86 +-
.../CodeGen/DirectX/flatten-bug-117273.ll | 10 +-
llvm/test/CodeGen/DirectX/legalize-i8.ll | 10 +-
.../legalize-load-store-array-alloca.ll | 41 +
llvm/test/CodeGen/DirectX/legalize-memcpy.ll | 56 +-
llvm/test/CodeGen/DirectX/legalize-memset.ll | 20 +-
llvm/test/CodeGen/DirectX/llc-pipeline.ll | 4 +-
.../DirectX/llc-vector-load-scalarize.ll | 187 +-
.../test/CodeGen/DirectX/scalar-bug-117273.ll | 18 +-
llvm/test/CodeGen/DirectX/scalar-store.ll | 32 +-
llvm/test/CodeGen/DirectX/scalarize-alloca.ll | 4 +-
llvm/test/CodeGen/Hexagon/llvm.exp10.ll | 232 +
llvm/test/CodeGen/Hexagon/llvm.frexp.ll | 803 ++
.../test/CodeGen/Hexagon/mpy-operand-hoist.ll | 38 +
llvm/test/CodeGen/Hexagon/thread-pointer.ll | 16 +
.../CodeGen/LoongArch/calling-conv-common.ll | 48 +-
.../CodeGen/LoongArch/calling-conv-half.ll | 16 +-
.../LoongArch/can-not-realign-stack.ll | 44 +-
.../CodeGen/LoongArch/emergency-spill-slot.ll | 4 +-
llvm/test/CodeGen/LoongArch/frame.ll | 107 +-
.../CodeGen/LoongArch/intrinsic-memcpy.ll | 8 +-
.../CodeGen/LoongArch/lasx/build-vector.ll | 154 +-
llvm/test/CodeGen/LoongArch/lasx/fpowi.ll | 88 +-
.../ir-instruction/bitcast-extract-element.ll | 4 -
.../lasx/ir-instruction/extractelement.ll | 120 +-
.../lasx/ir-instruction/fix-xvshuf.ll | 12 +-
.../ir-instruction/insert-bitcast-element.ll | 4 -
.../ir-instruction/insert-extract-element.ll | 30 +-
.../insert-extract-pair-elements.ll | 64 +-
.../lasx/ir-instruction/insertelement.ll | 138 +-
llvm/test/CodeGen/LoongArch/llvm.exp10.ll | 362 +
llvm/test/CodeGen/LoongArch/llvm.sincos.ll | 150 +-
.../ir-instruction/bitcast-extract-element.ll | 6 +-
.../ir-instruction/insert-bitcast-element.ll | 4 -
.../ir-instruction/insert-extract-element.ll | 20 +-
llvm/test/CodeGen/LoongArch/lsx/pr146455.ll | 287 +
.../LoongArch/stack-protector-target.ll | 104 +
...realignment-with-variable-sized-objects.ll | 24 +-
.../CodeGen/LoongArch/stack-realignment.ll | 80 +-
.../LoongArch/unaligned-memcpy-inline.ll | 14 +-
llvm/test/CodeGen/LoongArch/vararg.ll | 70 +-
llvm/test/CodeGen/MIR/AMDGPU/target-flags.mir | 16 +-
llvm/test/CodeGen/Mips/fastcc.ll | 12 -
llvm/test/CodeGen/Mips/fp-indexed-ls.ll | 11 -
.../Mips/indirect-jump-hazard/long-branch.ll | 1 -
llvm/test/CodeGen/Mips/ldexp.ll | 15 +
llvm/test/CodeGen/Mips/longbranch.ll | 38 -
llvm/test/CodeGen/Mips/nacl-align.ll | 99 -
llvm/test/CodeGen/Mips/nacl-branch-delay.ll | 71 -
llvm/test/CodeGen/Mips/nacl-reserved-regs.ll | 51 -
llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll | 30 +-
llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 8 +-
llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 8 +-
llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 5344 ++--------
llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 5344 ++--------
llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 5349 ++--------
llvm/test/CodeGen/NVPTX/cmpxchg.ll | 180 +-
llvm/test/CodeGen/NVPTX/cmpxchg.py | 90 +-
.../NVPTX/distributed-shared-cluster.ll | 46 +-
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 2630 +++--
.../NVPTX/no-stack-protector-libcall-error.ll | 10 +
llvm/test/CodeGen/NVPTX/prmt-const-folding.ll | 171 +
.../PowerPC/spe-vsx-incompatibility.ll | 8 +
.../CodeGen/PowerPC/stack-protector-target.ll | 164 +
llvm/test/CodeGen/RISCV/attributes.ll | 4 +
llvm/test/CodeGen/RISCV/features-info.ll | 1 +
llvm/test/CodeGen/RISCV/fpenv-xlen.ll | 34 +
.../RISCV/rvv/combine-reduce-add-to-vcpop.ll | 18 +-
llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll | 38 +-
.../rvv/fixed-vectors-deinterleave-load.ll | 53 +
.../RISCV/rvv/fixed-vectors-extract-i1.ll | 108 +-
.../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 194 +
.../rvv/fixed-vectors-interleaved-access.ll | 32 +-
.../rvv/fixed-vectors-shuffle-deinterleave.ll | 26 +-
.../RISCV/rvv/fixed-vectors-vsadd-vp.ll | 5 +-
.../RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 5 +-
.../RISCV/rvv/fixed-vectors-vssub-vp.ll | 11 +-
.../RISCV/rvv/fixed-vectors-vssubu-vp.ll | 11 +-
.../CodeGen/RISCV/rvv/reproducer-pr146855.ll | 72 +
llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 36 +-
.../test/CodeGen/RISCV/rvv/vl-opt-op-info.mir | 59 +
llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll | 194 +
.../RISCV/rvv/vp-vector-interleaved-access.ll | 262 +-
.../RISCV/rvv/vscale-vw-web-simplification.ll | 156 +-
llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll | 14 +-
.../RISCV/rvv/vxrm-insert-out-of-loop.ll | 32 +-
.../RISCV/stack-probing-frame-setup.mir | 198 +
.../CodeGen/RISCV/stack-protector-target.ll | 24 +
llvm/test/CodeGen/RISCV/xandesbfhcvt.ll | 23 +
.../test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll | 1 +
llvm/test/CodeGen/RISCV/xqcibm-insert.ll | 53 +
llvm/test/CodeGen/RISCV/xqcicli.ll | 717 ++
llvm/test/CodeGen/RISCV/xqcisls.ll | 282 +-
llvm/test/CodeGen/RISCV/xtheadmemidx.ll | 40 +-
.../CodeGen/SPARC/stack-protector-target.ll | 141 +
.../CodeGen/SPIRV/hlsl-intrinsics/refract.ll | 36 +
.../CodeGen/SPIRV/opencl/refract-error.ll | 12 +
.../SPIRV/transcoding/builtin_calls.ll | 100 +-
llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll | 42 +-
.../CodeGen/Thumb2/pacbti-m-outliner-1.ll | 6 +-
llvm/test/CodeGen/X86/abds-neg.ll | 410 +-
llvm/test/CodeGen/X86/abds.ll | 390 +-
llvm/test/CodeGen/X86/abdu-neg.ll | 282 +-
llvm/test/CodeGen/X86/abdu.ll | 225 +-
llvm/test/CodeGen/X86/abs.ll | 55 +-
llvm/test/CodeGen/X86/add-sub-bool.ll | 25 +-
llvm/test/CodeGen/X86/arg-copy-elide.ll | 8 +-
llvm/test/CodeGen/X86/avx512fp16-cvt.ll | 42 +-
llvm/test/CodeGen/X86/bitselect.ll | 55 +-
llvm/test/CodeGen/X86/bsf.ll | 144 +-
llvm/test/CodeGen/X86/bsr.ll | 158 +-
llvm/test/CodeGen/X86/bswap-wide-int.ll | 30 +-
llvm/test/CodeGen/X86/constructor.ll | 13 -
.../X86/div-rem-pair-recomposition-signed.ll | 36 +-
.../div-rem-pair-recomposition-unsigned.ll | 421 +-
llvm/test/CodeGen/X86/fast-isel-x32.ll | 1 -
llvm/test/CodeGen/X86/fp128-cast-strict.ll | 92 +-
llvm/test/CodeGen/X86/fp128-cast.ll | 125 +-
.../test/CodeGen/X86/fp128-libcalls-strict.ll | 2060 ++--
llvm/test/CodeGen/X86/fp128-libcalls.ll | 1773 ++--
llvm/test/CodeGen/X86/frameaddr.ll | 11 -
llvm/test/CodeGen/X86/freeze-unary.ll | 80 +
llvm/test/CodeGen/X86/fshl.ll | 185 +-
llvm/test/CodeGen/X86/fshr.ll | 170 +-
llvm/test/CodeGen/X86/funnel-shift.ll | 74 +-
llvm/test/CodeGen/X86/i128-add.ll | 23 +-
llvm/test/CodeGen/X86/i128-fp128-abi.ll | 706 +-
llvm/test/CodeGen/X86/i128-sdiv.ll | 61 +-
llvm/test/CodeGen/X86/i128-udiv.ll | 12 +-
llvm/test/CodeGen/X86/iabs.ll | 43 +-
llvm/test/CodeGen/X86/icmp-shift-opt.ll | 102 +-
.../X86/invalid-operand-bundle-call.ll | 4 +-
.../X86/invalid-operand-bundle-callbr.ll | 2 +-
.../X86/invalid-operand-bundle-invoke.ll | 2 +-
llvm/test/CodeGen/X86/kcfi.ll | 23 +
llvm/test/CodeGen/X86/lea-2.ll | 1 -
llvm/test/CodeGen/X86/lea-3.ll | 1 -
llvm/test/CodeGen/X86/lea-4.ll | 1 -
llvm/test/CodeGen/X86/lea-5.ll | 1 -
llvm/test/CodeGen/X86/lea.ll | 1 -
llvm/test/CodeGen/X86/mul128.ll | 97 +-
llvm/test/CodeGen/X86/neg-abs.ll | 55 +-
llvm/test/CodeGen/X86/pcsections-atomics.ll | 3378 +++++++
llvm/test/CodeGen/X86/popcnt.ll | 485 +-
llvm/test/CodeGen/X86/pr46004.ll | 19 +
llvm/test/CodeGen/X86/scalar-fp-to-i32.ll | 76 +-
llvm/test/CodeGen/X86/scalar-fp-to-i64.ll | 76 +-
llvm/test/CodeGen/X86/scmp.ll | 39 +-
llvm/test/CodeGen/X86/sdiv_fix.ll | 99 +-
llvm/test/CodeGen/X86/sdiv_fix_sat.ll | 440 +-
llvm/test/CodeGen/X86/shift-combine.ll | 14 +-
llvm/test/CodeGen/X86/shift-i128.ll | 72 +-
llvm/test/CodeGen/X86/smax.ll | 78 +-
llvm/test/CodeGen/X86/smin.ll | 81 +-
llvm/test/CodeGen/X86/stack-align2.ll | 5 -
.../X86/stack-protector-target-openbsd.ll | 81 +
llvm/test/CodeGen/X86/test-shrink-bug.ll | 19 +-
llvm/test/CodeGen/X86/ucmp.ll | 34 +-
llvm/test/CodeGen/X86/udiv_fix.ll | 28 +-
llvm/test/CodeGen/X86/udiv_fix_sat.ll | 28 +-
llvm/test/CodeGen/X86/umax.ll | 135 +-
llvm/test/CodeGen/X86/umin.ll | 81 +-
.../X86/umulo-128-legalisation-lowering.ll | 6 +-
.../CodeGen/X86/unreachable-mbb-undef-phi.mir | 1 +
llvm/test/CodeGen/X86/wide-integer-cmp.ll | 14 +-
.../CodeGen/X86/win32-int-runtime-libcalls.ll | 113 +
llvm/test/CodeGen/X86/x86-64-baseptr.ll | 8 -
.../CodeGen/X86/x86-64-stack-and-frame-ptr.ll | 9 -
llvm/test/CodeGen/XCore/llvm.frexp.ll | 85 +
.../KeyInstructions/Generic/verify.ll | 3 +
.../MemorySanitizer/X86/avx-intrinsics-x86.ll | 54 +-
.../X86/avx512-gfni-intrinsics.ll | 670 ++
.../X86/avx512-intrinsics-upgrade.ll | 210 +-
.../MemorySanitizer/X86/avx512-intrinsics.ll | 226 +-
.../X86/avx512bw-intrinsics-upgrade.ll | 58 +-
.../X86/avx512bw-intrinsics.ll | 58 +-
.../X86/avx512vl-intrinsics.ll | 206 +-
.../MemorySanitizer/X86/x86-vpermi2.ll | 26 +-
.../i386/avx-intrinsics-i386.ll | 54 +-
.../Instrumentation/MemorySanitizer/or.ll | 53 +-
.../MC/AArch64/directives-case_insensitive.s | 4 +-
llvm/test/MC/AArch64/reloc-directive-err.s | 4 +-
llvm/test/MC/AArch64/reloc-directive.s | 44 +-
llvm/test/MC/AMDGPU/fixup64.s | 27 +
llvm/test/MC/AMDGPU/gfx10_asm_vop3.s | 24 -
llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s | 16 +
llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s | 132 +
llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 405 +
llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 429 +
.../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 504 +
llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 536 +
.../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 108 +
llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 140 +
llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s | 152 +
.../gfx1250_asm_vop3_from_vop1-fake16.s | 405 +
.../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 429 +
.../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 504 +
.../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 536 +
.../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 144 +
.../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 176 +
llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s | 1234 +++
.../test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s | 384 +
llvm/test/MC/AMDGPU/gfx7_err_pos.s | 13 -
llvm/test/MC/AMDGPU/gfx8_err_pos.s | 10 -
llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s | 24 -
llvm/test/MC/AMDGPU/reloc-directive.s | 58 +-
.../MC/ARM/AlignedBundling/group-bundle-arm.s | 48 -
.../illegal-subtarget-change.s | 16 -
.../test/MC/ARM/AlignedBundling/lit.local.cfg | 2 -
.../AlignedBundling/pad-align-to-bundle-end.s | 41 -
.../MC/ARM/AlignedBundling/subtarget-change.s | 33 -
llvm/test/MC/ARM/arm_instructions.s | 6 -
llvm/test/MC/ARM/reloc-directive-err.s | 4 +-
llvm/test/MC/ARM/reloc-directive.s | 35 +-
llvm/test/MC/AVR/reloc-directive-err.s | 8 +-
llvm/test/MC/AVR/reloc-directive.s | 29 +-
.../AsmParser/AArch64/directive-parse-err.s | 7 +-
.../MC/Disassembler/AMDGPU/gfx10_vop3.txt | 24 -
.../Disassembler/AMDGPU/gfx1250_dasm_sop1.txt | 12 +
.../AMDGPU/gfx1250_dasm_vflat.txt | 2826 ++++++
.../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 552 ++
.../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 514 +
.../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 132 +
.../Disassembler/AMDGPU/gfx1250_dasm_vop2.txt | 111 +
.../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 557 ++
.../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 526 +-
.../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 172 +
.../AMDGPU/gfx1250_dasm_wmma_w32.txt | 734 ++
.../test/MC/Disassembler/AMDGPU/gfx9_vop3.txt | 24 -
llvm/test/MC/ELF/mc-dump.s | 22 +-
llvm/test/MC/ELF/reloc-directive.s | 51 +-
.../Relocations/reloc-directive-err.s | 4 +-
.../LoongArch/Relocations/reloc-directive.s | 32 +-
llvm/test/MC/Mips/nacl-mask.s | 319 -
llvm/test/MC/Mips/reloc-directive-bad-obj.s | 4 +-
llvm/test/MC/Mips/reloc-directive-bad.s | 4 +-
.../MC/Mips/reloc-directive-label-offset.s | 8 +-
llvm/test/MC/Mips/reloc-directive.s | 95 +-
llvm/test/MC/PowerPC/ppc32-reloc-directive.s | 38 +-
llvm/test/MC/PowerPC/ppc64-reloc-directive.s | 42 +-
llvm/test/MC/RISCV/Relocations/mc-dump.s | 2 +-
llvm/test/MC/RISCV/reloc-directive-err.s | 4 +-
llvm/test/MC/RISCV/reloc-directive.s | 36 +-
llvm/test/MC/RISCV/rv32p-valid.s | 5 +-
llvm/test/MC/RISCV/rv64p-valid.s | 8 +-
llvm/test/MC/RISCV/xqci-fixups.s | 36 +
llvm/test/MC/RISCV/xqcibi-relocations.s | 15 +-
llvm/test/MC/RISCV/xqcilb-relocations.s | 17 +-
llvm/test/MC/RISCV/xqcili-relocations.s | 17 +-
llvm/test/MC/RISCV/xqcilia-valid.s | 40 +
.../MC/Sparc/Relocations/reloc-directive.s | 32 +-
llvm/test/MC/SystemZ/reloc-directive.s | 42 +-
.../align-mode-argument-error.s | 8 -
.../asm-printing-bundle-directives.s | 22 -
.../autogen-inst-offset-align-to-end.s | 2899 ------
.../autogen-inst-offset-padding.s | 2674 -----
.../bundle-group-too-large-error.s | 18 -
.../bundle-lock-option-error.s | 11 -
.../bundle-subtarget-change-error.s | 16 -
.../X86/AlignedBundling/different-sections.s | 27 -
.../test/MC/X86/AlignedBundling/labeloffset.s | 85 -
.../test/MC/X86/AlignedBundling/lit.local.cfg | 2 -
.../lock-without-bundle-mode-error.s | 10 -
.../MC/X86/AlignedBundling/long-nop-pad.s | 31 -
.../AlignedBundling/misaligned-bundle-group.s | 19 -
.../X86/AlignedBundling/misaligned-bundle.s | 26 -
llvm/test/MC/X86/AlignedBundling/nesting.s | 73 -
.../AlignedBundling/pad-align-to-bundle-end.s | 35 -
.../X86/AlignedBundling/pad-bundle-groups.s | 49 -
.../X86/AlignedBundling/relax-at-bundle-end.s | 18 -
.../AlignedBundling/relax-in-bundle-group.s | 44 -
.../MC/X86/AlignedBundling/rodata-section.s | 30 -
.../X86/AlignedBundling/section-alignment.s | 23 -
.../AlignedBundling/single-inst-bundling.s | 52 -
.../switch-section-locked-error.s | 16 -
.../unlock-without-lock-error.s | 11 -
llvm/test/MC/X86/align-branch-bundle.s | 21 -
llvm/test/MC/X86/reloc-directive-elf-32.s | 33 +-
llvm/test/MC/X86/reloc-directive-elf-64.s | 39 +-
llvm/test/MC/X86/reloc-directive.s | 10 +-
.../test/Other/opt-bisect-new-pass-manager.ll | 76 +-
llvm/test/Other/opt-disable.ll | 91 +
llvm/test/TableGen/RuntimeLibcallEmitter.td | 45 +-
llvm/test/TableGen/directive1.td | 25 +
llvm/test/TableGen/directive2.td | 25 +
.../coro-async-addr-lifetime-start-bug.ll | 6 +-
.../DeadStoreElimination/zeroed-missing.ll | 2 +-
.../Transforms/InferFunctionAttrs/no-proto.ll | 3 +
.../InstCombine/AMDGPU/ptr-replace-alloca.ll | 33 +
.../fold-icmp-without-constant-into-phi.ll | 159 +
llvm/test/Transforms/InstCombine/freeze.ll | 11 +
.../Transforms/InstCombine/icmp-select.ll | 7 +-
.../Transforms/InstCombine/ptrauth-call.ll | 97 +
.../InstCombine/ptrauth-intrinsics-call.ll | 142 +
.../InstCombine/select-fixed-zero.ll | 221 +
llvm/test/Transforms/InstCombine/select.ll | 7 +-
.../InstSimplify/ConstProp/atan-intrinsic.ll | 3 +-
.../ConstProp/sinh-cosh-intrinsics.ll | 6 +-
.../InstSimplify/fold-intrinsics.ll | 199 +
llvm/test/Transforms/InstSimplify/sincos.ll | 9 +-
.../AArch64/fixed-deinterleave-intrinsics.ll | 32 +-
.../scalable-deinterleave-intrinsics.ll | 36 +-
.../AArch64/sve-deinterleave4.ll | 27 +-
.../AArch64/sve-interleaved-accesses.ll | 4 +
.../InterleavedAccess/RISCV/addrspace.ll | 2 +-
.../RISCV/interleaved-accesses.ll | 56 +-
.../LoopInterchange/fp-reductions.ll | 437 +
.../reductions-non-wrapped-operations.ll | 335 +
.../reductions-with-nowraps.ll | 144 +
.../LoopUnroll/AArch64/apple-unrolling.ll | 315 +
.../LoopUnroll/partial-unroll-reductions.ll | 446 +
.../LoopUnroll/runtime-unroll-reductions.ll | 238 +
.../AArch64/conditional-branches-cost.ll | 4 +-
.../AArch64/fmin-without-fast-math-flags.ll | 74 +
.../LoopVectorize/AArch64/induction-costs.ll | 6 +-
.../AArch64/loop-vectorization-factors.ll | 2 +-
.../partial-reduce-dot-product-neon.ll | 6 +-
.../AArch64/partial-reduce-dot-product.ll | 12 +-
.../LoopVectorize/AArch64/partial-reduce.ll | 256 +
.../ARM/mve-reg-pressure-vmla.ll | 4 +-
.../first-order-recurrence-scalable-vf1.ll | 6 +-
...ce-tail-with-evl-fixed-order-recurrence.ll | 8 +-
.../LoopVectorize/X86/induction-costs.ll | 2 +-
.../first-order-recurrence-chains.ll | 80 +-
...irst-order-recurrence-dead-instructions.ll | 2 +-
.../first-order-recurrence-scalable-vf1.ll | 6 +-
.../LoopVectorize/first-order-recurrence.ll | 12 +-
.../fmin-without-fast-math-flags.ll | 435 +
llvm/test/Transforms/LoopVectorize/optsize.ll | 2 +-
.../reuse-lcssa-phi-scev-expansion.ll | 4 +-
.../scalable-first-order-recurrence.ll | 24 +-
.../LoopVectorize/vplan-printing.ll | 2 +-
.../PhaseOrdering/AArch64/quant_4x4.ll | 10 +-
.../PhaseOrdering/ARM/arm_mult_q15.ll | 2 +-
.../SLPVectorizer/AArch64/ext-trunc.ll | 5 +-
.../AArch64/external-use-icmp.ll | 23 +-
.../extract-subvector-long-input.ll | 3 +-
.../AArch64/extractelements-to-shuffle.ll | 13 +-
...ather-buildvector-with-minbitwidth-user.ll | 56 +-
.../SLPVectorizer/AArch64/getelementptr.ll | 92 +-
.../SLPVectorizer/AArch64/getelementptr2.ll | 2 +-
.../AArch64/memory-runtime-checks.ll | 18 +-
.../AArch64/multiple_reduction.ll | 633 +-
.../{ => AArch64}/phi-node-bitwidt-op-not.ll | 3 +-
.../SLPVectorizer/AArch64/splat-loads.ll | 20 +-
.../AArch64/unreachable-blocks-with-phis.ll | 8 +-
.../AArch64/vector-getelementptr.ll | 43 +-
.../vectorizable-selects-uniform-cmps.ll | 2 +-
.../SLPVectorizer/RISCV/segmented-loads.ll | 31 +
.../X86/extract-subvector-long-input.ll | 86 +
.../icmp-altopcode-after-reordering.ll | 3 +-
.../X86/matched-nodes-updated.ll | 44 +-
.../X86/phi-node-bitwidt-op-not.ll | 95 +
.../alternate-opcode-sindle-bv.ll | 8 +-
.../Transforms/SafeStack/NVPTX/lit.local.cfg | 2 +
.../NVPTX/safestack-libcall-error.ll | 12 +
...safestack-pointer-address-libcall-error.ll | 12 +
.../SimplifyCFG/ARM/phi-eliminate.ll | 12 +-
.../Transforms/SimplifyCFG/switch-dup-bbs.ll | 41 +
.../SimplifyCFG/switch-range-to-icmp.ll | 2 +-
.../cross-dso-cfi-stack-chk-fail.ll | 34 +
.../StackProtector/stack-chk-fail-alias.ll | 22 +
.../load-extractelement-scalarization.ll | 16 +-
llvm/test/lit.cfg.py | 10 +-
llvm/test/lit.site.cfg.py.in | 2 +-
.../tools/llc/new-pm/start-stop-inserted.ll | 15 +
llvm/test/tools/llc/new-pm/start-stop.ll | 2 +-
.../llvm-exegesis/AArch64/setReg_init_check.s | 16 +-
llvm/test/tools/llvm-ir2vec/embeddings.ll | 73 +
llvm/test/tools/llvm-ir2vec/triplets.ll | 38 +
.../tools/llvm-mca/RISCV/SpacemitX60/atomic.s | 188 +-
.../RISCV/SpacemitX60/floating-point.s | 182 +-
.../llvm-mca/RISCV/SpacemitX60/integer.s | 244 +-
.../RISCV/SpacemitX60/rvv-arithmetic.s | 6820 +++++++++++++
.../llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s | 4328 ++++++++
.../RISCV/SpacemitX60/rvv-comparison.s | 2704 +++++
.../RISCV/SpacemitX60/rvv-conversion.s | 1757 ++++
.../llvm-mca/RISCV/SpacemitX60/rvv-fma.s | 2185 +++++
.../tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s | 5599 +++++++++++
.../llvm-mca/RISCV/SpacemitX60/rvv-mask.s | 1864 ++++
.../llvm-mca/RISCV/SpacemitX60/rvv-minmax.s | 1108 +++
.../llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s | 2984 ++++++
.../RISCV/SpacemitX60/rvv-permutation.s | 3504 +++++++
.../RISCV/SpacemitX60/rvv-reduction.s | 1824 ++++
.../ELF/binary-output-target.test | 6 +
.../ELF/strip-preserve-atime.test | 2 +
.../llvm-objdump/ELF/RISCV/rv32-plt.test | 44 +
.../llvm-objdump/ELF/RISCV/rv64-plt.test | 44 +
.../llvm-objdump/ELF/program-headers.test | 119 +-
.../acceptance-test.test | 70 +
.../llvm-original-di-preservation/basic.test | 12 +-
.../tools/llvm-readobj/ELF/section-types.test | 5 +
.../tools/yaml2obj/ELF/program-header.yaml | 4 +
llvm/tools/llvm-ir2vec/CMakeLists.txt | 10 +
llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 314 +
llvm/tools/llvm-objcopy/ObjcopyOptions.cpp | 2 +
llvm/tools/llvm-objdump/ELFDump.cpp | 3 +
llvm/tools/llvm-readobj/ELFDumper.cpp | 1 +
llvm/tools/llvm-shlib/CMakeLists.txt | 9 +
llvm/unittests/Analysis/IR2VecTest.cpp | 63 +
llvm/unittests/Analysis/ValueTrackingTest.cpp | 4 +-
llvm/unittests/BinaryFormat/CMakeLists.txt | 1 +
llvm/unittests/BinaryFormat/SFrameTest.cpp | 118 +
.../ExecutionEngine/JITLink/AArch32Tests.cpp | 31 +-
.../OpenMPDirectiveNameParserTest.cpp | 21 +-
.../Frontend/OpenMPIRBuilderTest.cpp | 59 +-
.../FuzzMutate/RandomIRBuilderTest.cpp | 2 +-
llvm/unittests/IR/InstructionsTest.cpp | 4 +-
llvm/unittests/IR/IntrinsicsTest.cpp | 26 +
llvm/unittests/Object/ELFObjectFileTest.cpp | 8 +-
.../TargetParser/RISCVISAInfoTest.cpp | 1 +
.../TargetParser/TargetParserTest.cpp | 4 -
.../Transforms/Utils/DebugifyTest.cpp | 9 +-
llvm/unittests/Transforms/Utils/LocalTest.cpp | 70 +-
.../utils/TableGen/Basic/DirectiveEmitter.cpp | 22 +-
.../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 27 +-
.../TableGen/Common/CodeGenRegisters.cpp | 30 +-
llvm/utils/TableGen/Common/CodeGenRegisters.h | 4 +-
llvm/utils/TableGen/CompressInstEmitter.cpp | 10 +-
llvm/utils/TableGen/DecoderEmitter.cpp | 126 +-
llvm/utils/TableGen/RegisterInfoEmitter.cpp | 2 +-
.../gn/secondary/clang/lib/Basic/BUILD.gn | 3 +-
.../gn/secondary/clang/lib/CodeGen/BUILD.gn | 1 -
.../gn/secondary/clang/lib/Driver/BUILD.gn | 1 -
.../gn/secondary/clang/lib/Headers/BUILD.gn | 2 +
.../secondary/clang/tools/libclang/BUILD.gn | 1 +
.../gn/secondary/libcxx/include/BUILD.gn | 4 +-
llvm/utils/gn/secondary/lld/MachO/BUILD.gn | 1 +
.../Plugins/Language/CPlusPlus/BUILD.gn | 4 +-
.../lib/Target/Mips/MCTargetDesc/BUILD.gn | 1 -
.../llvm/unittests/BinaryFormat/BUILD.gn | 1 +
llvm/utils/gn/secondary/llvm/version.gni | 2 +-
llvm/utils/lit/lit/__init__.py | 2 +-
llvm/utils/llvm-original-di-preservation.py | 154 +-
llvm/utils/mlgo-utils/mlgo/__init__.py | 2 +-
llvm/utils/release/export.sh | 2 +-
llvm/utils/remote-exec.py | 11 +-
mlir/docs/DefiningDialects/Operations.md | 10 +-
mlir/include/mlir-c/Pass.h | 4 +
.../Analysis/Presburger/IntegerRelation.h | 13 +
mlir/include/mlir/Analysis/SliceAnalysis.h | 10 +-
.../mlir/Bindings/Python/NanobindAdaptors.h | 160 +-
.../ComplexToROCDLLibraryCalls.h | 27 +
mlir/include/mlir/Conversion/Passes.h | 1 +
mlir/include/mlir/Conversion/Passes.td | 12 +
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 16 +-
.../Dialect/Affine/Analysis/AffineAnalysis.h | 16 +-
.../Bufferization/IR/BufferizationOps.td | 2 +-
.../include/mlir/Dialect/LLVMIR/LLVMDialect.h | 3 +
mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 65 +
.../Linalg/TransformOps/LinalgTransformOps.td | 11 +-
.../Dialect/Linalg/Transforms/Transforms.h | 3 +-
.../mlir/Dialect/MemRef/IR/MemRefOps.td | 60 +-
.../Dialect/Mesh/Transforms/Simplifications.h | 8 +-
mlir/include/mlir/Dialect/OpenACC/OpenACC.h | 1 +
.../mlir/Dialect/OpenACC/OpenACCOps.td | 12 +-
.../mlir/Dialect/OpenMP/OpenMPClauses.td | 51 +
.../mlir/Dialect/OpenMP/OpenMPEnums.td | 30 +
mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 23 +
.../mlir/Dialect/SPIRV/IR/SPIRVBase.td | 15 +-
.../Dialect/SPIRV/IR/SPIRVStructureOps.td | 78 +
.../Dialect/Tosa/IR/TosaComplianceData.h.inc | 2 -
.../mlir/Dialect/Vector/IR/VectorOps.td | 55 +-
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 49 +-
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 11 +
mlir/include/mlir/IR/CommonAttrConstraints.td | 4 +
mlir/include/mlir/IR/OpBase.td | 2 +-
mlir/include/mlir/IR/PatternMatch.h | 19 +
.../include/mlir/Target/LLVMIR/ModuleImport.h | 5 +
.../mlir/Target/LLVMIR/ModuleTranslation.h | 11 +
.../Analysis/Presburger/IntegerRelation.cpp | 38 +
mlir/lib/Analysis/SliceAnalysis.cpp | 65 +-
mlir/lib/Bindings/Python/IRCore.cpp | 22 +-
mlir/lib/Bindings/Python/IRModule.h | 14 +-
mlir/lib/Bindings/Python/Pass.cpp | 18 +-
mlir/lib/CAPI/IR/Pass.cpp | 4 +
mlir/lib/Conversion/CMakeLists.txt | 1 +
.../ComplexToROCDLLibraryCalls/CMakeLists.txt | 18 +
.../ComplexToROCDLLibraryCalls.cpp | 92 +
.../FileLineColLocBreakpointManager.cpp | 2 -
mlir/lib/Debug/ExecutionContext.cpp | 2 -
mlir/lib/Debug/Observers/ActionProfiler.cpp | 2 -
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 4 +
mlir/lib/Dialect/Affine/Analysis/Utils.cpp | 33 +-
.../Dialect/Arith/IR/ArithCanonicalization.td | 4 +-
mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 119 +-
.../TransformOps/LinalgTransformOps.cpp | 3 +-
.../Linalg/Transforms/TilingInterfaceImpl.cpp | 53 +-
.../Linalg/Transforms/Vectorization.cpp | 58 +-
mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 112 +-
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 15 +
mlir/lib/Dialect/PDL/IR/PDL.cpp | 1 -
mlir/lib/Dialect/PDLInterp/IR/PDLInterp.cpp | 1 -
mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp | 2 -
mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp | 2 -
mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp | 1 -
.../Dialect/Quant/IR/QuantDialectBytecode.cpp | 2 -
mlir/lib/Dialect/Quant/IR/QuantTypes.cpp | 3 -
.../Quant/Transforms/NormalizeQuantTypes.cpp | 1 -
.../Quant/Transforms/StripFuncQuantTypes.cpp | 5 -
mlir/lib/Dialect/SCF/IR/SCF.cpp | 1 -
.../SCF/TransformOps/SCFTransformOps.cpp | 3 -
.../BufferDeallocationOpInterfaceImpl.cpp | 1 -
.../BufferizableOpInterfaceImpl.cpp | 1 -
.../SCF/Transforms/LoopSpecialization.cpp | 1 -
.../SCF/Transforms/ParallelLoopCollapsing.cpp | 2 -
.../SCF/Transforms/UpliftWhileToFor.cpp | 2 -
.../SCF/Utils/AffineCanonicalizationUtils.cpp | 2 -
mlir/lib/Dialect/SCF/Utils/Utils.cpp | 3 -
mlir/lib/Dialect/SMT/IR/SMTAttributes.cpp | 1 -
mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp | 101 +
mlir/lib/Dialect/Shape/IR/Shape.cpp | 1 -
.../BufferizableOpInterfaceImpl.cpp | 1 -
.../Transforms/OutlineShapeComputation.cpp | 3 -
.../Shape/Transforms/ShapeToShapeLowering.cpp | 1 -
.../Dialect/Tosa/IR/TosaCanonicalizations.cpp | 6 +-
mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 3 +-
.../Tosa/Transforms/TosaProfileCompliance.cpp | 18 +-
mlir/lib/Dialect/Traits.cpp | 1 -
mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp | 1 -
.../IR/ScalableValueBoundsConstraintSet.cpp | 1 -
mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 40 +-
.../TransformOps/VectorTransformOps.cpp | 2 -
.../Vector/Transforms/LowerVectorContract.cpp | 9 -
.../Vector/Transforms/LowerVectorGather.cpp | 7 -
.../Vector/Transforms/LowerVectorScan.cpp | 10 -
.../Vector/Transforms/LowerVectorTransfer.cpp | 3 -
.../Transforms/LowerVectorTranspose.cpp | 1 -
.../Vector/Transforms/VectorDistribute.cpp | 188 +-
.../Transforms/VectorMaskElimination.cpp | 2 -
.../VectorTransferSplitRewritePatterns.cpp | 7 -
.../Dialect/X86Vector/IR/X86VectorDialect.cpp | 3 -
.../X86Vector/Transforms/AVXTranspose.cpp | 1 -
.../Transforms/LegalizeForLLVMExport.cpp | 2 -
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 1 -
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 121 +-
.../XeGPU/Transforms/XeGPUBlocking.cpp | 2 -
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 1 -
.../Transforms/XeGPUSubgroupDistribute.cpp | 163 +-
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 -
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 25 +
mlir/lib/IR/AttributeDetail.h | 29 +-
mlir/lib/IR/CMakeLists.txt | 1 +
mlir/lib/IR/MLIRContext.cpp | 1 +
mlir/lib/IR/PatternLoggingListener.cpp | 50 +
mlir/lib/Interfaces/ControlFlowInterfaces.cpp | 2 -
mlir/lib/Interfaces/LoopLikeInterface.cpp | 1 -
mlir/lib/Interfaces/SideEffectInterfaces.cpp | 1 -
mlir/lib/Pass/Pass.cpp | 4 -
mlir/lib/Pass/PassCrashRecovery.cpp | 12 +-
mlir/lib/Pass/PassTiming.cpp | 1 -
mlir/lib/Query/Matcher/ErrorBuilder.cpp | 1 -
mlir/lib/Query/Query.cpp | 1 -
mlir/lib/Reducer/ReductionNode.cpp | 1 -
mlir/lib/Reducer/ReductionTreePass.cpp | 3 -
mlir/lib/Rewrite/FrozenRewritePatternSet.cpp | 1 -
mlir/lib/Rewrite/PatternApplicator.cpp | 16 +-
mlir/lib/Support/Timing.cpp | 2 -
.../LLVMIR/LLVMToLLVMIRTranslation.cpp | 24 +-
.../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 1 +
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 35 -
mlir/lib/Target/LLVMIR/ModuleImport.cpp | 48 +-
mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 34 +-
.../SPIRV/Deserialization/DeserializeOps.cpp | 24 +-
.../SPIRV/Deserialization/Deserializer.cpp | 97 +-
.../SPIRV/Deserialization/Deserializer.h | 38 +
.../SPIRV/Serialization/SerializeOps.cpp | 42 +
.../Target/SPIRV/Serialization/Serializer.cpp | 55 +
.../Target/SPIRV/Serialization/Serializer.h | 20 +
mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp | 2 -
mlir/lib/Tools/PDLL/ODS/Context.cpp | 1 -
mlir/lib/Tools/PDLL/ODS/Dialect.cpp | 2 -
mlir/lib/Tools/PDLL/Parser/Parser.cpp | 1 -
mlir/lib/Tools/lsp-server-support/Logging.cpp | 1 -
.../lib/Tools/lsp-server-support/Protocol.cpp | 5 -
.../Tools/lsp-server-support/Transport.cpp | 1 -
mlir/lib/Tools/mlir-opt/MlirOptMain.cpp | 7 -
.../Tools/mlir-pdll-lsp-server/LSPServer.cpp | 2 -
.../Tools/mlir-pdll-lsp-server/Protocol.cpp | 6 -
mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp | 3 -
mlir/lib/Tools/mlir-translate/Translation.cpp | 2 -
.../lib/Tools/tblgen-lsp-server/LSPServer.cpp | 2 -
.../Transforms/Utils/DialectConversion.cpp | 86 +-
mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 7 +
.../mlir/_mlir_libs/_mlir/passmanager.pyi | 1 +
.../Conversion/AMDGPUToROCDL/load_lds.mlir | 7 +-
.../complex-to-rocdl-library-calls.mlir | 26 +
.../VectorToXeGPU/load-to-xegpu.mlir | 2 +-
.../VectorToXeGPU/store-to-xegpu.mlir | 2 +-
.../VectorToXeGPU/transfer-read-to-xegpu.mlir | 2 +-
.../transfer-write-to-xegpu.mlir | 2 +-
mlir/test/Dialect/AMDGPU/invalid.mlir | 8 +
mlir/test/Dialect/AMDGPU/ops.mlir | 11 +
mlir/test/Dialect/Affine/slicing-utils.mlir | 23 +
mlir/test/Dialect/Arith/canonicalize.mlir | 12 +
mlir/test/Dialect/LLVMIR/ifunc.mlir | 40 +
mlir/test/Dialect/LLVMIR/invalid.mlir | 27 +
.../Linalg/vectorization/linalg-ops.mlir | 168 +-
mlir/test/Dialect/MemRef/invalid.mlir | 18 +
mlir/test/Dialect/MemRef/ops.mlir | 11 +
mlir/test/Dialect/Mesh/simplifications.mlir | 12 +
mlir/test/Dialect/OpenMP/invalid.mlir | 24 +
mlir/test/Dialect/OpenMP/ops.mlir | 33 +
mlir/test/Dialect/SPIRV/IR/availability.mlir | 14 +
mlir/test/Dialect/SPIRV/IR/structure-ops.mlir | 87 +
mlir/test/Dialect/Tosa/canonicalize.mlir | 11 +
mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 9 +
mlir/test/Dialect/Vector/canonicalize.mlir | 15 +
mlir/test/Dialect/Vector/invalid.mlir | 18 +
mlir/test/Dialect/Vector/ops.mlir | 10 +
.../Vector/vector-warp-distribute.mlir | 108 +
mlir/test/Dialect/XeGPU/invalid.mlir | 29 +-
mlir/test/Dialect/XeGPU/ops.mlir | 45 +-
.../Dialect/XeGPU/subgroup-distribute.mlir | 16 +-
.../IR/test-pattern-logging-listener.mlir | 24 +
.../tile-and-fuse-consumer.mlir | 451 +-
mlir/test/Target/LLVMIR/Import/ifunc.ll | 63 +
mlir/test/Target/LLVMIR/ifunc.mlir | 70 +
.../LLVMIR/openmp-composite-simd-if.mlir | 97 +
mlir/test/Target/LLVMIR/openmp-llvm.mlir | 2 -
mlir/test/Target/LLVMIR/openmp-reduction.mlir | 9 +-
mlir/test/Target/LLVMIR/openmp-todo.mlir | 40 -
mlir/test/Target/SPIRV/constant.mlir | 105 +-
mlir/test/Target/SPIRV/spec-constant.mlir | 27 +
mlir/test/lit.cfg.py | 11 +
mlir/test/python/dialects/python_test.py | 8 +-
mlir/test/python/ir/operation.py | 9 +
.../python/lib/PythonTestModuleNanobind.cpp | 6 +-
mlir/test/python/pass_manager.py | 57 +
.../Presburger/IntegerRelationTest.cpp | 94 +
.../Dialect/OpenACC/OpenACCOpsTest.cpp | 44 +-
mlir/unittests/IR/CMakeLists.txt | 1 +
.../IR/DistinctAttributeAllocatorTest.cpp | 45 +
offload/liboffload/src/OffloadImpl.cpp | 57 +-
.../OffloadAPI/symbol/olGetSymbol.cpp | 18 +
third-party/benchmark/src/sysinfo.cc | 4 +-
.../llvm-project-overlay/clang/BUILD.bazel | 7 +
.../llvm-project-overlay/libc/BUILD.bazel | 155 +-
.../libc/test/src/math/BUILD.bazel | 8 +
.../libc/test/src/math/smoke/BUILD.bazel | 2 +
.../llvm-project-overlay/mlir/BUILD.bazel | 34 +-
2305 files changed, 132980 insertions(+), 59132 deletions(-)
create mode 100644 bolt/test/AArch64/veneer-bolt.s
create mode 100644 clang-tools-extra/test/clang-query/trailing-comma.c
create mode 100644 clang/docs/CXXTypeAwareAllocators.rst
create mode 100644 clang/include/clang/Basic/BuiltinsRISCVXAndes.td
create mode 100644 clang/include/clang/CIR/Dialect/IR/CIREnumAttr.td
delete mode 100644 clang/lib/Basic/Targets/PNaCl.cpp
delete mode 100644 clang/lib/Basic/Targets/PNaCl.h
create mode 100644 clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
delete mode 100644 clang/lib/CodeGen/Targets/PNaCl.cpp
delete mode 100644 clang/lib/Driver/ToolChains/NaCl.cpp
delete mode 100644 clang/lib/Driver/ToolChains/NaCl.h
create mode 100644 clang/lib/Headers/cuda_wrappers/__utility/declval.h
create mode 100644 clang/lib/Headers/riscv_nds.h
create mode 100644 clang/test/Analysis/div-zero-cxx20.cpp
create mode 100644 clang/test/CIR/CodeGen/complex-unary.cpp
create mode 100644 clang/test/CodeGen/PowerPC/ppc-dmf-future-builtin-err.c
rename clang/test/CodeGen/PowerPC/{ppc-future-mma-builtin-err.c => ppc-dmf-mma-builtin-err.c} (75%)
create mode 100644 clang/test/CodeGen/RISCV/riscv-xandesbfhcvt-c-api.c
create mode 100644 clang/test/CodeGen/RISCV/riscv-xandesbfhcvt.c
rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/non-overloaded/{vcpopv.c => vcpop.c} (100%)
rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/overloaded/{vcpopv.c => vcpop.c} (100%)
rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/non-overloaded/{vcpopv.c => vcpop.c} (100%)
rename clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/overloaded/{vcpopv.c => vcpop.c} (100%)
create mode 100644 clang/test/CodeGen/X86/i128-debuginfo.c
delete mode 100644 clang/test/CodeGen/X86/x86_64-arguments-nacl.c
delete mode 100644 clang/test/CodeGen/malign-double-x86-nacl.c
create mode 100644 clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp
delete mode 100644 clang/test/CodeGenCXX/x86_64-arguments-nacl-x32.cpp
create mode 100644 clang/test/CodeGenHLSL/builtins/refract.hlsl
create mode 100644 clang/test/CodeGenObjC/gnustep2-class-exts.m
create mode 100644 clang/test/CodeGenObjC/ptrauth-attr-exception.m
create mode 100644 clang/test/CodeGenObjC/ptrauth-block-isa.m
create mode 100644 clang/test/CodeGenObjC/ptrauth-class-ro.m
create mode 100644 clang/test/CodeGenObjC/ptrauth-class.m
create mode 100644 clang/test/CodeGenObjC/ptrauth-objc-interface-selector.m
create mode 100644 clang/test/CodeGenObjC/ptrauth-objc-isa-super.m
create mode 100644 clang/test/CodeGenObjC/ptrauth-objc-method-list-pointer.m
create mode 100644 clang/test/CodeGenObjC/ptrauth-property-backing.m
create mode 100644 clang/test/CodeGenObjCXX/ptrauth-objc-interface-selector.mm
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
create mode 100644 clang/test/CodeGenOpenCL/scoped-atomic.cl
create mode 100644 clang/test/CodeGenSPIRV/Builtins/refract.c
create mode 100644 clang/test/DebugInfo/KeyInstructions/array-cookie.cpp
create mode 100644 clang/test/Driver/DTLTO/dtlto.c
delete mode 100644 clang/test/Driver/nacl-direct.c
delete mode 100644 clang/test/Driver/x86_64-nacl-defines.cpp
delete mode 100644 clang/test/Frontend/x86_64-nacl-types.cpp
create mode 100644 clang/test/Parser/gh114815.cpp
create mode 100644 clang/test/SemaCXX/noreturn-vars.cpp
create mode 100644 clang/test/SemaCXX/warn-uninitialized-const-pointer.cpp
create mode 100644 clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
create mode 100644 clang/test/SemaObjC/ptrauth-pointers.m
create mode 100644 clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp
create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
create mode 100644 clang/test/SemaSPIRV/BuiltIns/refract-errors.c
create mode 100644 clang/test/SemaTemplate/gh138371.cpp
create mode 100644 clang/tools/libclang/Obsolete.cpp
create mode 100644 flang/docs/ParallelMultiImageFortranRuntime.md
create mode 100644 flang/test/Driver/fatal-errors-warnings.f90
create mode 100644 flang/test/Lower/CUDA/cuda-set-allocator.cuf
create mode 100644 flang/test/Lower/Intrinsics/cospi.f90
create mode 100644 flang/test/Lower/OpenACC/acc-use-device.f90
create mode 100644 flang/test/Lower/amdgcn-complex.f90
create mode 100644 flang/test/Preprocessing/omp-sentinel-fixed-form.F
create mode 100644 flang/test/Semantics/bind-c18.f90
create mode 100644 flang/test/Semantics/bug148559.f90
create mode 100644 flang/test/Semantics/bug148675.f90
delete mode 100644 libc/fuzzing/math/atan_fuzz.cpp
create mode 100644 libc/fuzzing/math/log10_fuzz.cpp
create mode 100644 libc/fuzzing/math/log1p_fuzz.cpp
create mode 100644 libc/fuzzing/math/log2_fuzz.cpp
create mode 100644 libc/fuzzing/math/log_fuzz.cpp
create mode 100644 libc/fuzzing/math/sqrt_fuzz.cpp
delete mode 100644 libc/include/llvm-libc-macros/dlfcn-macros.h
create mode 100644 libc/include/wctype.yaml
create mode 100644 libc/shared/math/exp.h
create mode 100644 libc/shared/math/exp10.h
create mode 100644 libc/shared/math/exp10f.h
create mode 100644 libc/src/__support/math/exp.h
create mode 100644 libc/src/__support/math/exp10.h
rename libc/src/{math/generic/exp10f_impl.h => __support/math/exp10f.h} (91%)
create mode 100644 libc/src/__support/math/exp10f_utils.h
create mode 100644 libc/src/__support/math/exp_constants.h
create mode 100644 libc/src/__support/math/exp_utils.h
create mode 100644 libc/src/__support/wchar/string_converter.h
delete mode 100644 libc/src/math/generic/explogxf.cpp
create mode 100644 libc/src/string/memory_utils/arm/common.h
create mode 100644 libc/src/string/memory_utils/arm/inline_memset.h
create mode 100644 libc/src/wctype/CMakeLists.txt
create mode 100644 libc/src/wctype/iswalpha.cpp
create mode 100644 libc/src/wctype/iswalpha.h
create mode 100644 libc/test/src/__support/wchar/string_converter_test.cpp
create mode 100644 libc/test/src/wctype/CMakeLists.txt
create mode 100644 libc/test/src/wctype/iswalpha_test.cpp
create mode 100644 libclc/clc/include/clc/atomic/atomic_decl.inc
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_dec.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_exchange.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_inc.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_load.h
create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_store.h
create mode 100644 libclc/clc/include/clc/integer/clc_bit_reverse.h
create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc
create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h
create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h
create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_insert.h
create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_insert.inc
delete mode 100644 libclc/clc/include/clc/relational/floatn.inc
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_dec.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_def.inc
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_inc.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_load.cl
create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_store.cl
create mode 100644 libclc/clc/lib/generic/integer/clc_bit_reverse.cl
rename libc/include/dlfcn.h.def => libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl (56%)
create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc
create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl
create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc
create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_insert.cl
create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_insert.inc
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_load.h
create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_store.h
create mode 100644 libclc/opencl/include/clc/opencl/integer/bit_reverse.h
create mode 100644 libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h
create mode 100644 libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h
create mode 100644 libclc/opencl/include/clc/opencl/integer/bitfield_insert.h
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_def.inc
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_exchange.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_load.cl
create mode 100644 libclc/opencl/lib/generic/atomic/atomic_store.cl
create mode 100644 libclc/opencl/lib/generic/integer/bit_reverse.cl
create mode 100644 libclc/opencl/lib/generic/integer/bitfield_extract_def.inc
create mode 100644 libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl
create mode 100644 libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl
create mode 100644 libclc/opencl/lib/generic/integer/bitfield_insert.cl
create mode 100644 libclc/opencl/lib/generic/integer/bitfield_insert.inc
create mode 100644 libclc/utils/CMakeLists.txt
create mode 100644 libcxx/include/__format/range_format.h
delete mode 100644 libcxx/include/__locale_dir/locale_base_api/android.h
create mode 100644 libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp
create mode 100644 libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp
create mode 100644 libcxx/test/extensions/libcxx/always_initialize_iterators.pass.cpp
create mode 100644 libcxx/test/extensions/libcxx/localization/lit.local.cfg
rename libcxx/test/{libcxx/localization/locales => extensions/libcxx/localization}/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp (100%)
delete mode 100644 libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp
create mode 100644 libcxx/test/libcxx/mem/mem.res/ctor.nullptr.assert.pass.cpp
create mode 100644 libcxx/test/libcxx/mem/mem.res/nonnull.verify.cpp
delete mode 100644 libcxx/test/libcxx/minimal_cxx11_configuration.pass.cpp
create mode 100644 libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.writefail.pass.cpp
rename libcxx/test/{libcxx => std}/numerics/c.math/fdelayed-template-parsing.pass.cpp (100%)
rename libcxx/test/{libcxx/numerics/rand => std/numerics/rand/rand.req}/rand.req.urng/valid_int_type.verify.cpp (100%)
rename libcxx/test/{libcxx/numerics/rand => std/numerics/rand/rand.req}/rand.req.urng/valid_real_type.verify.cpp (100%)
rename libcxx/test/{libcxx/memory => std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create}/shared_ptr_array.pass.cpp (94%)
create mode 100644 lld/MachO/LinkerOptimizationHints.cpp
create mode 100644 lld/MachO/LinkerOptimizationHints.h
create mode 100644 lld/test/ELF/hexagon-attributes.s
create mode 100644 lld/test/MachO/loh-arm64-32.s
create mode 100644 lldb/examples/python/filter_disasm.py
rename lldb/source/Plugins/Language/CPlusPlus/{LibCxxList.cpp => GenericList.cpp} (58%)
create mode 100644 lldb/source/Plugins/Language/CPlusPlus/MsvcStlTuple.cpp
create mode 100644 lldb/source/Plugins/Language/CPlusPlus/MsvcStlVector.cpp
create mode 100755 lldb/test/Shell/Commands/Inputs/dis_filt.py
create mode 100644 lldb/test/Shell/Commands/command-disassemble-riscv32-bytes.s
create mode 100644 lldb/test/Shell/Commands/command-disassemble-x86-bytes.s
create mode 100644 lldb/test/Shell/Minidump/missing-memory-region.yaml
create mode 100644 lldb/test/Shell/Settings/TestChildCountTruncation.test
create mode 100644 lldb/test/Shell/Settings/TestChildDepthTruncation.test
create mode 100644 llvm/clang/test/Modules/implicit-opt-level.c
create mode 100644 llvm/docs/CommandGuide/llvm-ir2vec.rst
create mode 100644 llvm/docs/KeyInstructionsDebugInfo.md
create mode 100644 llvm/include/llvm/BinaryFormat/SFrame.h
create mode 100644 llvm/include/llvm/CodeGen/ProcessImplicitDefs.h
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h
delete mode 100644 llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
delete mode 100644 llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
create mode 100644 llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll
create mode 100644 llvm/test/CodeGen/AArch64/pr148949.ll
create mode 100644 llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
create mode 100644 llvm/test/CodeGen/AArch64/unsupported-object-format-err.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/known-bits-sbfe.mir
delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/bf16-math.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/fold-commute-sgpr.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/global-address.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/literal64.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx10.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx9.mir
delete mode 100644 llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu-flat.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-fp.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-int.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-fma-f64.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir
create mode 100644 llvm/test/CodeGen/ARM/stack-protector-target.ll
delete mode 100644 llvm/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
create mode 100644 llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll
create mode 100644 llvm/test/CodeGen/Hexagon/llvm.exp10.ll
create mode 100644 llvm/test/CodeGen/Hexagon/llvm.frexp.ll
create mode 100644 llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll
create mode 100644 llvm/test/CodeGen/Hexagon/thread-pointer.ll
create mode 100644 llvm/test/CodeGen/LoongArch/llvm.exp10.ll
create mode 100644 llvm/test/CodeGen/LoongArch/lsx/pr146455.ll
create mode 100644 llvm/test/CodeGen/LoongArch/stack-protector-target.ll
delete mode 100644 llvm/test/CodeGen/Mips/nacl-align.ll
delete mode 100644 llvm/test/CodeGen/Mips/nacl-branch-delay.ll
delete mode 100644 llvm/test/CodeGen/Mips/nacl-reserved-regs.ll
create mode 100644 llvm/test/CodeGen/NVPTX/no-stack-protector-libcall-error.ll
create mode 100644 llvm/test/CodeGen/NVPTX/prmt-const-folding.ll
create mode 100644 llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll
create mode 100644 llvm/test/CodeGen/PowerPC/stack-protector-target.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll
create mode 100644 llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir
create mode 100644 llvm/test/CodeGen/RISCV/xandesbfhcvt.ll
create mode 100644
llvm/test/CodeGen/RISCV/xqcicli.ll create mode 100644 llvm/test/CodeGen/SPARC/stack-protector-target.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll create mode 100644 llvm/test/CodeGen/SPIRV/opencl/refract-error.ll create mode 100644 llvm/test/CodeGen/X86/stack-protector-target-openbsd.ll create mode 100644 llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll create mode 100644 llvm/test/CodeGen/XCore/llvm.frexp.ll create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll create mode 100644 llvm/test/MC/AMDGPU/fixup64.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s delete mode 100644 llvm/test/MC/ARM/AlignedBundling/group-bundle-arm.s delete mode 100644 llvm/test/MC/ARM/AlignedBundling/illegal-subtarget-change.s delete mode 100644 llvm/test/MC/ARM/AlignedBundling/lit.local.cfg delete mode 100644 llvm/test/MC/ARM/AlignedBundling/pad-align-to-bundle-end.s delete mode 100644 llvm/test/MC/ARM/AlignedBundling/subtarget-change.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt delete mode 100644 llvm/test/MC/Mips/nacl-mask.s create mode 100644 llvm/test/MC/RISCV/xqci-fixups.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/align-mode-argument-error.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/asm-printing-bundle-directives.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/autogen-inst-offset-padding.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/bundle-group-too-large-error.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/bundle-lock-option-error.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/bundle-subtarget-change-error.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/different-sections.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/labeloffset.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/lit.local.cfg delete mode 100644 llvm/test/MC/X86/AlignedBundling/lock-without-bundle-mode-error.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/long-nop-pad.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/misaligned-bundle-group.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/misaligned-bundle.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/nesting.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/pad-bundle-groups.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/relax-at-bundle-end.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/relax-in-bundle-group.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/rodata-section.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/section-alignment.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/single-inst-bundling.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/switch-section-locked-error.s delete mode 100644 llvm/test/MC/X86/AlignedBundling/unlock-without-lock-error.s delete mode 100644 llvm/test/MC/X86/align-branch-bundle.s create mode 100644 llvm/test/Other/opt-disable.ll create mode 100644 llvm/test/Transforms/InstCombine/fold-icmp-without-constant-into-phi.ll create mode 100644 llvm/test/Transforms/InstCombine/ptrauth-call.ll create mode 100644 llvm/test/Transforms/InstCombine/ptrauth-intrinsics-call.ll create mode 100644 llvm/test/Transforms/InstCombine/select-fixed-zero.ll create mode 100644 
llvm/test/Transforms/LoopInterchange/fp-reductions.ll create mode 100644 llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll create mode 100644 llvm/test/Transforms/LoopInterchange/reductions-with-nowraps.ll create mode 100644 llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll create mode 100644 llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll create mode 100644 llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll rename llvm/test/Transforms/SLPVectorizer/{ => AArch64}/extract-subvector-long-input.ll (89%) rename llvm/test/Transforms/SLPVectorizer/{ => AArch64}/phi-node-bitwidt-op-not.ll (94%) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll rename llvm/test/Transforms/SLPVectorizer/{ => X86}/icmp-altopcode-after-reordering.ll (91%) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll create mode 100644 llvm/test/Transforms/SafeStack/NVPTX/lit.local.cfg create mode 100644 llvm/test/Transforms/SafeStack/NVPTX/safestack-libcall-error.ll create mode 100644 llvm/test/Transforms/SafeStack/NVPTX/safestack-pointer-address-libcall-error.ll create mode 100644 llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll create mode 100644 llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll create mode 100644 llvm/test/tools/llc/new-pm/start-stop-inserted.ll create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll create mode 100644 llvm/test/tools/llvm-ir2vec/triplets.ll create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-comparison.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-conversion.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fma.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mask.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-minmax.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s create mode 100644 llvm/test/tools/llvm-objdump/ELF/RISCV/rv32-plt.test create mode 100644 llvm/test/tools/llvm-objdump/ELF/RISCV/rv64-plt.test create mode 100644 llvm/test/tools/llvm-original-di-preservation/acceptance-test.test create mode 100644 llvm/tools/llvm-ir2vec/CMakeLists.txt create mode 100644 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp create mode 100644 llvm/unittests/BinaryFormat/SFrameTest.cpp create mode 100644 mlir/include/mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h create mode 100644 mlir/lib/Conversion/ComplexToROCDLLibraryCalls/CMakeLists.txt create mode 100644 mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp create mode 100644 mlir/lib/IR/PatternLoggingListener.cpp create mode 100644 mlir/test/Conversion/ComplexToROCDLLibraryCalls/complex-to-rocdl-library-calls.mlir create mode 100644 mlir/test/Dialect/LLVMIR/ifunc.mlir create mode 100644 mlir/test/IR/test-pattern-logging-listener.mlir create mode 100644 mlir/test/Target/LLVMIR/Import/ifunc.ll create mode 100644 mlir/test/Target/LLVMIR/ifunc.mlir 
 create mode 100644 mlir/test/Target/LLVMIR/openmp-composite-simd-if.mlir
 create mode 100644 mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp

diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index c3cf714ce6c10..8e25fd61d6b32 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -19,6 +19,7 @@
 PROJECT_DEPENDENCIES = {
     "llvm": set(),
     "clang": {"llvm"},
+    "CIR": {"clang", "mlir"},
    "bolt": {"clang", "lld", "llvm"},
     "clang-tools-extra": {"clang", "llvm"},
     "compiler-rt": {"clang", "lld"},
@@ -55,6 +56,7 @@
     ".ci": {
         "llvm",
         "clang",
+        "CIR",
         "lld",
         "lldb",
         "bolt",
@@ -128,6 +130,7 @@
     "lldb": "check-lldb",
     "llvm": "check-llvm",
     "clang": "check-clang",
+    "CIR": "check-clang-cir",
     "bolt": "check-bolt",
     "lld": "check-lld",
     "flang": "check-flang",
@@ -247,6 +250,14 @@ def _get_modified_projects(modified_files: list[str]) -> Set[str]:
         # capacity.
         if len(path_parts) > 3 and path_parts[:3] == ("llvm", "utils", "gn"):
             continue
+        # If the file is in the clang/lib/CIR directory, add the CIR project.
+        if len(path_parts) > 3 and (
+            path_parts[:3] == ("clang", "lib", "CIR")
+            or path_parts[:3] == ("clang", "test", "CIR")
+            or path_parts[:4] == ("clang", "include", "clang", "CIR")
+        ):
+            modified_projects.add("CIR")
+        # Fall through to add clang.
         modified_projects.add(pathlib.Path(modified_file).parts[0])
     return modified_projects
@@ -267,6 +278,13 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
     runtimes_check_targets_needs_reconfig = _compute_project_check_targets(
         runtimes_to_test_needs_reconfig
     )
+
+    # CIR is used as a pseudo-project in this script. It is built as part of
+    # the clang build, but it requires an explicit option to enable. We set
+    # that option here, and remove it from the projects_to_build list.
+    enable_cir = "ON" if "CIR" in projects_to_build else "OFF"
+    projects_to_build.discard("CIR")
+
     # We use a semicolon to separate the projects/runtimes as they get passed
     # to the CMake invocation and thus we need to use the CMake list separator
     # (;).
     # We use spaces to separate the check targets as they end up getting
@@ -279,6 +297,7 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
         "runtimes_check_targets_needs_reconfig": " ".join(
             sorted(runtimes_check_targets_needs_reconfig)
         ),
+        "enable_cir": enable_cir,
     }

diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index 6299931e1ec34..732514c96f5a6 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -104,6 +104,10 @@ def test_clang(self):
             env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
+        self.assertEqual(
+            env_variables["enable_cir"],
+            "OFF",
+        )

     def test_clang_windows(self):
         env_variables = compute_projects.get_env_variables(
@@ -126,6 +130,32 @@ def test_clang_windows(self):
             env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
+        self.assertEqual(env_variables["enable_cir"], "OFF")
+
+    def test_cir(self):
+        env_variables = compute_projects.get_env_variables(
+            ["clang/lib/CIR/CMakeLists.txt"], "Linux"
+        )
+        self.assertEqual(
+            env_variables["projects_to_build"],
+            "clang;clang-tools-extra;lld;llvm;mlir",
+        )
+        self.assertEqual(
+            env_variables["project_check_targets"],
+            "check-clang check-clang-cir check-clang-tools",
+        )
+        self.assertEqual(
+            env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind"
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets"],
+            "check-compiler-rt",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
+            "check-cxx check-cxxabi check-unwind",
+        )
+        self.assertEqual(env_variables["enable_cir"], "ON")

     def test_bolt(self):
         env_variables = compute_projects.get_env_variables(
@@ -158,6 +188,7 @@ def test_mlir(self):
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
         self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+        self.assertEqual(env_variables["enable_cir"], "OFF")

     def test_flang(self):
         env_variables = compute_projects.get_env_variables(
@@ -168,6 +199,7 @@ def test_flang(self):
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
         self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+        self.assertEqual(env_variables["enable_cir"], "OFF")

     def test_invalid_subproject(self):
         env_variables = compute_projects.get_env_variables(
@@ -237,7 +269,7 @@ def test_ci(self):
         )
         self.assertEqual(
             env_variables["project_check_targets"],
-            "check-bolt check-clang check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
+            "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
         )
         self.assertEqual(
             env_variables["runtimes_to_build"],

diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 8d1faab13986c..6db24d894eb73 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -21,12 +21,7 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}"
 INSTALL_DIR="${BUILD_DIR}/install"
 rm -rf "${BUILD_DIR}"

-ccache --zero-stats
-
-if [[ -n "${CLEAR_CACHE:-}" ]]; then
-  echo "clearing cache"
-  ccache --clear
-fi
+sccache --zero-stats

 mkdir -p artifacts/reproducers

@@ -36,7 +31,7 @@ export CLANG_CRASH_DIAGNOSTICS_DIR=`realpath artifacts/reproducers`

 function at-exit {
   retcode=$?

-  ccache --print-stats > artifacts/ccache_stats.txt
+  sccache --show-stats > artifacts/sccache_stats.txt

   cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log
   cp "${BUILD_DIR}"/test-results.*.xml artifacts/ || :
@@ -53,6 +48,7 @@ targets="${2}"
 runtimes="${3}"
 runtime_targets="${4}"
 runtime_targets_needs_reconfig="${5}"
+enable_cir="${6}"

 lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests"

@@ -72,13 +68,15 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -G Ninja \
       -D CMAKE_PREFIX_PATH="${HOME}/.local" \
       -D CMAKE_BUILD_TYPE=Release \
+      -D CLANG_ENABLE_CIR=${enable_cir} \
       -D LLVM_ENABLE_ASSERTIONS=ON \
       -D LLVM_BUILD_EXAMPLES=ON \
       -D COMPILER_RT_BUILD_LIBFUZZER=OFF \
       -D LLVM_LIT_ARGS="${lit_args}" \
       -D LLVM_ENABLE_LLD=ON \
       -D CMAKE_CXX_FLAGS=-gmlt \
-      -D LLVM_CCACHE_BUILD=ON \
+      -D CMAKE_C_COMPILER_LAUNCHER=sccache \
+      -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \
       -D LIBCXX_CXX_ABI=libcxxabi \
       -D MLIR_ENABLE_BINDINGS_PYTHON=ON \
       -D LLDB_ENABLE_PYTHON=ON \

diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 176350fac604c..50a741677d734 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -21,11 +21,6 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}"

 rm -rf "${BUILD_DIR}"

-if [[ -n "${CLEAR_CACHE:-}" ]]; then
-  echo "clearing sccache"
-  rm -rf "$SCCACHE_DIR"
-fi
-
 sccache --zero-stats

 function at-exit {
   retcode=$?

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index c951a24fc8d31..dd7e1771c9361 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -31,9 +31,9 @@
 /clang/www/cxx_dr_status.html @Endilll
 /clang/www/make_cxx_dr_status @Endilll

-/clang/include/clang/CIR @lanza @bcardosolopes
-/clang/lib/CIR @lanza @bcardosolopes
-/clang/tools/cir-* @lanza @bcardosolopes
+/clang/include/clang/CIR @lanza @bcardosolopes @xlauko @andykaylor
+/clang/lib/CIR @lanza @bcardosolopes @xlauko @andykaylor
+/clang/tools/cir-* @lanza @bcardosolopes @xlauko @andykaylor

 /lldb/ @JDevlieghere

diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index b05e9c6c56ed0..8e0fa8d42d735 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -48,6 +48,9 @@ flang:frontend:
   - flang/Evaluate/**/*
   - flang/Semantics/**/*

+libclc:
+  - libclc/**
+
 HLSL:
   - clang/*HLSL*/**/*
   - clang/**/*HLSL*
@@ -717,6 +720,8 @@ mlgo:
   - llvm/lib/Analysis/IR2Vec.cpp
   - llvm/lib/Analysis/models/**
   - llvm/test/Analysis/IR2Vec/**
+  - llvm/tools/llvm-ir2vec/**
+  - llvm/docs/CommandGuide/llvm-ir2vec.rst

 tools:llvm-exegesis:
   - llvm/tools/llvm-exegesis/**

diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile
index efe08ebc221c5..69c71f638e2ac 100644
--- a/.github/workflows/containers/github-action-ci/Dockerfile
+++ b/.github/workflows/containers/github-action-ci/Dockerfile
@@ -63,11 +63,21 @@ RUN apt-get update && \
     python3-pip \
     ccache \
     file \
-    tzdata \
-    sccache && \
+    tzdata && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*

+# We need sccache for caching. We cannot use the apt repository version because
+# it is too old and has bugs related to features we require (particularly GCS
+# caching), so we manually install it here.
+# TODO(boomanaiden154): We should return to installing this from the apt
+# repository once a version containing the necessary bug fixes is available.
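+# The download below is pinned to a single release and verified against a
+# hard-coded sha256: `sha256sum -c` returns non-zero on a mismatch and aborts
+# the RUN step (and thus the image build), while `tar ... -O` writes the one
+# extracted `sccache` binary to stdout so no unpacked files are left behind.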
+RUN curl -L 'https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-x86_64-unknown-linux-musl.tar.gz' > /tmp/sccache.tar.gz && \
+    echo "1fbb35e135660d04a2d5e42b59c7874d39b3deb17de56330b25b713ec59f849b /tmp/sccache.tar.gz" | sha256sum -c && \
+    tar xzf /tmp/sccache.tar.gz -O --wildcards '*/sccache' > '/usr/local/bin/sccache' && \
+    rm /tmp/sccache.tar.gz && \
+    chmod +x /usr/local/bin/sccache
+
 ENV LLVM_SYSROOT=$LLVM_SYSROOT
 ENV PATH=${LLVM_SYSROOT}/bin:${PATH}

diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index f7a48304b82b0..73943bc86eadd 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -37,6 +37,7 @@ jobs:
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
         with:
+          variant: "sccache"
           max-size: "2000M"
       - name: Build and Test
         # Mark the job as a success even if the step fails so that people do
@@ -61,7 +62,7 @@
           export CC=/opt/llvm/bin/clang
           export CXX=/opt/llvm/bin/clang++

-          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}"
+          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}" "${enable_cir}"
       - name: Upload Artifacts
         if: '!cancelled()'
         uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
@@ -76,7 +77,7 @@
     if: >-
      github.repository_owner == 'llvm' &&
      (github.event_name != 'pull_request' || github.event.action != 'closed')
-    runs-on: llvm-premerge-windows-runners
+    runs-on: llvm-premerge-windows-2022-runners
    defaults:
      run:
        shell: bash

diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 91ecf89da618c..f6c4b70121d34 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -225,6 +225,9 @@ class BinaryContext {
   /// Store all functions in the binary, sorted by original address.
   std::map<uint64_t, BinaryFunction> BinaryFunctions;

+  /// Functions to be included in the output, in sorted order.
+  std::vector<BinaryFunction *> OutputFunctions;
+
   /// A mutex that is used to control parallel accesses to BinaryFunctions
   mutable llvm::sys::RWMutex BinaryFunctionsMutex;

@@ -559,6 +562,8 @@ class BinaryContext {
     return InjectedBinaryFunctions;
   }

+  BinaryFunction *createThunkBinaryFunction(const std::string &Name);
+
   /// Return vector with all functions, i.e. include functions from the input
   /// binary and functions created by BOLT.
   std::vector<BinaryFunction *> getAllBinaryFunctions();

@@ -1356,8 +1361,12 @@ class BinaryContext {
   unsigned addDebugFilenameToUnit(const uint32_t DestCUID,
                                   const uint32_t SrcCUID, unsigned FileIndex);

-  /// Return functions in output layout order
-  std::vector<BinaryFunction *> getSortedFunctions();
+  /// Return a vector of functions in the order they are ready for code
+  /// emission. The vector will include functions added/injected by BOLT.
+  const std::vector<BinaryFunction *> &getOutputFunctions();
+
+  /// Update the function list for the output.
+  void updateOutputFunctions(std::vector<BinaryFunction *> &Functions);

   /// Do the best effort to calculate the size of the function by emitting
   /// its code, and relaxing branch instructions. By default, branch
@@ -1378,6 +1387,10 @@ class BinaryContext {
   uint64_t computeInstructionSize(const MCInst &Inst,
                                   const MCCodeEmitter *Emitter = nullptr) const {
+    // FIXME: hack for faster size computation on AArch64.
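+    // AArch64 is a fixed-width ISA: every non-pseudo instruction encodes to
+    // exactly four bytes, so the emitter-based computation below is skipped.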
+    if (isAArch64())
+      return MIB->isPseudo(Inst) ? 0 : 4;
+
     if (std::optional<uint32_t> Size = MIB->getSize(Inst))
       return *Size;

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index ae580520b9110..a8c750fff6321 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -368,6 +368,10 @@ class BinaryFunction {
   /// True if the function should not have an associated symbol table entry.
   bool IsAnonymous{false};

+  /// True if the function is used for remapping hot text and shall not be
+  /// placed on a huge page.
+  bool IsHotTextMover{false};
+
   /// Name for the section this function code should reside in.
   std::string CodeSectionName;

@@ -1415,6 +1419,8 @@ class BinaryFunction {
   /// Return true if the function uses ORC format for stack unwinding.
   bool hasORC() const { return HasORC; }

+  bool isHotTextMover() const { return IsHotTextMover; }
+
   const JumpTable *getJumpTable(const MCInst &Inst) const {
     const uint64_t Address = BC.MIB->getJumpTable(Inst);
     return getJumpTableContainingAddress(Address);
@@ -1769,6 +1775,8 @@ class BinaryFunction {
   /// Mark function that should not be emitted.
   void setIgnored();

+  void setHotTextMover(bool V) { IsHotTextMover = V; }
+
   void setHasIndirectTargetToSplitFragment(bool V) {
     HasIndirectTargetToSplitFragment = V;
   }

diff --git a/bolt/include/bolt/Passes/LongJmp.h b/bolt/include/bolt/Passes/LongJmp.h
index df3ea9620918a..f3b6c25fb706c 100644
--- a/bolt/include/bolt/Passes/LongJmp.h
+++ b/bolt/include/bolt/Passes/LongJmp.h
@@ -76,6 +76,39 @@ class LongJmpPass : public BinaryFunctionPass {
   /// 128MB of each other.
   void relaxLocalBranches(BinaryFunction &BF);

+  struct FunctionCluster {
+    // All functions in this cluster.
+    DenseSet<BinaryFunction *> Functions;
+
+    // Functions that this cluster of functions is calling. Note that it
+    // excludes all functions in the cluster itself.
+    DenseSet<BinaryFunction *> Callees;
+
+    // Estimated size of the cluster in bytes.
+    uint64_t Size{0};
+
+    // Indices of the first and last functions in the cluster. The last index
+    // is used as an insertion point for adding thunks to the output function
+    // list.
+    size_t FirstFunctionIndex = -1;
+    size_t LastFunctionIndex = -1;
+
+    // Thunks located at the end of this cluster.
+    std::vector<BinaryFunction *> ThunkList;
+
+    // Thunks used by this cluster. Some could be in a ThunkList of the
+    // preceding cluster.
+    DenseMap<BinaryFunction *, BinaryFunction *> Thunks;
+  };
+
+  /// Maximum size of the function cluster. Note that it is less than 128MB
+  /// since the size of the cluster plus its thunk island must stay under
+  /// 128MB.
+  static constexpr uint64_t MaxClusterSize = 125 * 1024 * 1024;
+
+  /// Relax calls for the medium code model where code is < 256MB.
+  /// A thunk island will be introduced between two clusters of functions to
+  /// enable calls over 128MB.
+  void relaxCalls(BinaryContext &BC);
+
   /// -- Layout estimation methods --
   /// Try to do layout before running the emitter, by looking at
   /// BinaryFunctions and MCInsts -- this is an estimation.
   /// To be correct for longjmp inserter

diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 84f1853469709..e5dc4f7f6e086 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1592,14 +1592,46 @@ unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID,
                                          DestCUID, DstUnit->getVersion()));
 }

-std::vector<BinaryFunction *> BinaryContext::getSortedFunctions() {
-  std::vector<BinaryFunction *> SortedFunctions(BinaryFunctions.size());
+const std::vector<BinaryFunction *> &BinaryContext::getOutputFunctions() {
+  assert((!HasRelocations || HasFinalizedFunctionOrder) &&
+         "Output function order not finalized");
+
+  if (!OutputFunctions.empty())
+    return OutputFunctions;
+
+  OutputFunctions.reserve(BinaryFunctions.size() +
+                          InjectedBinaryFunctions.size());
   llvm::transform(llvm::make_second_range(BinaryFunctions),
-                  SortedFunctions.begin(),
+                  std::back_inserter(OutputFunctions),
                   [](BinaryFunction &BF) { return &BF; });
-  llvm::stable_sort(SortedFunctions, compareBinaryFunctionByIndex);
-  return SortedFunctions;
+  llvm::erase_if(OutputFunctions,
+                 [this](BinaryFunction *BF) { return !shouldEmit(*BF); });
+
+  llvm::stable_sort(OutputFunctions,
+                    [](const BinaryFunction *A, const BinaryFunction *B) {
+                      // Place hot text movers at the start.
+                      if (A->isHotTextMover() && !B->isHotTextMover())
+                        return true;
+                      if (!A->isHotTextMover() && B->isHotTextMover())
+                        return false;
+                      if (A->hasValidIndex() && B->hasValidIndex()) {
+                        return A->getIndex() < B->getIndex();
+                      }
+                      if (opts::HotFunctionsAtEnd)
+                        return B->hasValidIndex();
+                      else
+                        return A->hasValidIndex();
+                    });
+
+  llvm::copy(InjectedBinaryFunctions, std::back_inserter(OutputFunctions));
+
+  return OutputFunctions;
+}
+
+void BinaryContext::updateOutputFunctions(
+    std::vector<BinaryFunction *> &Functions) {
+  OutputFunctions.swap(Functions);
 }

 std::vector<BinaryFunction *> BinaryContext::getAllBinaryFunctions() {
@@ -2385,6 +2417,10 @@ BinaryContext::createInjectedBinaryFunction(const std::string &Name,
   BinaryFunction *BF = InjectedBinaryFunctions.back();
   setSymbolToFunctionMap(BF->getSymbol(), BF);
   BF->CurrentState = BinaryFunction::State::CFG;
+
+  if (!OutputFunctions.empty())
+    OutputFunctions.push_back(BF);
+
   return BF;
 }

@@ -2421,6 +2457,12 @@ BinaryContext::createInstructionPatch(uint64_t Address,
   return PBF;
 }

+BinaryFunction *
+BinaryContext::createThunkBinaryFunction(const std::string &Name) {
+  static NameResolver NR;
+  return createInjectedBinaryFunction(NR.uniquify(Name));
+}
+
 std::pair<size_t, size_t>
 BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
   // Use the original size for non-simple functions.

diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 7b5cd276fee89..68f16614be283 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -281,11 +281,7 @@ void BinaryEmitter::emitFunctions() {
   }

   // Emit functions in sorted order.
-  std::vector<BinaryFunction *> SortedFunctions = BC.getSortedFunctions();
-  emit(SortedFunctions);
-
-  // Emit functions added by BOLT.
-  emit(BC.getInjectedBinaryFunctions());
+  emit(BC.getOutputFunctions());

   // Mark the end of hot text.
   if (opts::HotText) {

diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index eec68ff5a5fce..fb4503210450a 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -114,6 +114,10 @@ cl::opt<bool>
     cl::desc("try to preserve basic block alignment"),
     cl::cat(BoltOptCategory));

+static cl::opt<bool> PrintOffsets("print-offsets",
+                                  cl::desc("print basic block offsets"),
+                                  cl::Hidden, cl::cat(BoltOptCategory));
+
 static cl::opt<bool> PrintOutputAddressRange(
     "print-output-address-range",
     cl::desc(
@@ -555,6 +559,11 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
     if (BB->isLandingPad())
       OS << "  Landing Pad\n";

+    if (opts::PrintOffsets && BB->getOutputStartAddress()) {
+      OS << "  OutputOffset: 0x"
+         << Twine::utohexstr(BB->getOutputStartAddress()) << '\n';
+    }
+
     uint64_t BBExecCount = BB->getExecutionCount();
     if (hasValidProfile()) {
       OS << "  Exec Count : ";

diff --git a/bolt/lib/Passes/Aligner.cpp b/bolt/lib/Passes/Aligner.cpp
index c3ddedaaa1466..1b499aca2c141 100644
--- a/bolt/lib/Passes/Aligner.cpp
+++ b/bolt/lib/Passes/Aligner.cpp
@@ -77,6 +77,11 @@ static void alignCompact(BinaryFunction &Function,
   size_t HotSize = 0;
   size_t ColdSize = 0;

+  if (!Function.hasProfile() && BC.isAArch64()) {
+    Function.setAlignment(Function.getMinAlignment());
+    return;
+  }
+
   for (const BinaryBasicBlock &BB : Function)
     if (BB.isSplit())
       ColdSize += BC.computeCodeSize(BB.begin(), BB.end(), Emitter);

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 5d44e1a1a4902..1dbd000c1d388 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1287,6 +1287,8 @@ Error AssignSections::runOnFunctions(BinaryContext &BC) {
     if (opts::isHotTextMover(Function)) {
       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
+      // TODO: find a better place to mark a function as a mover.
+      Function.setHotTextMover(true);
       continue;
     }

diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 4dade161cc232..d37356f043666 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -27,6 +27,11 @@ extern cl::opt<unsigned> AlignFunctions;
 extern cl::opt<bool> UseOldText;
 extern cl::opt<bool> HotFunctionsAtEnd;

+static cl::opt<bool>
+    ExperimentalRelaxation("relax-exp",
+                           cl::desc("run experimental relaxation pass"),
+                           cl::init(false), cl::cat(BoltOptCategory));
+
 static cl::opt<bool> GroupStubs("group-stubs",
                                 cl::desc("share stubs across functions"),
                                 cl::init(true), cl::cat(BoltOptCategory));
@@ -893,12 +898,262 @@ void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
   }
 }

+void LongJmpPass::relaxCalls(BinaryContext &BC) {
+  std::vector<BinaryFunction *> OutputFunctions = BC.getOutputFunctions();
+
+  // Map every function to its direct callees. Note that this is different
+  // from a typical call graph as here we completely ignore indirect calls.
+  uint64_t EstimatedSize = 0;
+  // Conservatively estimate emitted function size.
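+  // The estimate errs on the high side: function alignment padding and the
+  // worst-case constant island footprint are counted in full, so clusters
+  // built from these sizes should not outgrow the direct-branch range.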
+  auto estimateFunctionSize = [&](const BinaryFunction &BF) -> uint64_t {
+    if (!BC.shouldEmit(BF))
+      return 0;
+    uint64_t Size = BF.estimateSize();
+    if (BF.hasValidIndex())
+      Size += BF.getAlignment();
+    if (BF.hasIslandsInfo()) {
+      Size += BF.estimateConstantIslandSize();
+      Size += BF.getConstantIslandAlignment();
+    }
+
+    return Size;
+  };
+
+  DenseMap<BinaryFunction *, DenseSet<BinaryFunction *>> CallMap;
+  for (BinaryFunction *BF : OutputFunctions) {
+    if (!BC.shouldEmit(*BF) || BF->isPatch())
+      continue;
+
+    EstimatedSize += estimateFunctionSize(*BF);
+
+    for (const BinaryBasicBlock &BB : *BF) {
+      for (const MCInst &Inst : BB) {
+        if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) ||
+            BC.MIB->isIndirectBranch(Inst))
+          continue;
+        const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst);
+        assert(TargetSymbol);
+
+        BinaryFunction *Callee = BC.getFunctionForSymbol(TargetSymbol);
+        if (!Callee) {
+          // Ignore internal calls that use basic block labels as a
+          // destination.
+          continue;
+        }
+
+        CallMap[BF].insert(Callee);
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LongJmp: estimated code size : " << EstimatedSize
+                    << '\n');
+
+  // Build clusters in the order the functions will appear in the output.
+  std::vector<FunctionCluster> Clusters;
+  for (size_t Index = 0, NumFuncs = OutputFunctions.size(); Index < NumFuncs;
+       ++Index) {
+    const size_t BFIndex =
+        opts::HotFunctionsAtEnd ? NumFuncs - Index - 1 : Index;
+    BinaryFunction *BF = OutputFunctions[BFIndex];
+    if (!BC.shouldEmit(*BF) || BF->isPatch())
+      continue;
+
+    const uint64_t BFSize = estimateFunctionSize(*BF);
+    if (Clusters.empty() || Clusters.back().Size + BFSize > MaxClusterSize) {
+      Clusters.emplace_back(FunctionCluster());
+      Clusters.back().FirstFunctionIndex = BFIndex;
+    }
+
+    FunctionCluster &FC = Clusters.back();
+    FC.Functions.insert(BF);
+    auto It = FC.Callees.find(BF);
+    if (It != FC.Callees.end()) {
+      FC.Callees.erase(It);
+    }
+    FC.Size += BFSize;
+    FC.LastFunctionIndex = BFIndex;
+
+    for (BinaryFunction *Callee : CallMap[BF])
+      if (!FC.Functions.count(Callee))
+        FC.Callees.insert(Callee);
+  }
+
+  if (opts::HotFunctionsAtEnd) {
+    std::reverse(Clusters.begin(), Clusters.end());
+    llvm::for_each(Clusters, [](FunctionCluster &FC) {
+      std::swap(FC.LastFunctionIndex, FC.FirstFunctionIndex);
+    });
+  }
+
+  // Print cluster stats.
+  dbgs() << "Built " << Clusters.size() << " clusters\n";
+  uint64_t ClusterIndex = 0;
+  for (const FunctionCluster &FC : Clusters) {
+    dbgs() << "  Cluster: " << ClusterIndex++ << '\n';
+    dbgs() << "    " << FC.Functions.size() << " functions\n";
+    dbgs() << "    " << FC.Callees.size() << " callees\n";
+    dbgs() << "    " << FC.Size << " bytes\n";
+    dbgs() << "    " << FC.LastFunctionIndex << " is last function\n";
+  }
+
+  // Populate one of the clusters with PLT functions based on the proximity of
+  // the PLT section to avoid unneeded thunk redirection.
+  // FIXME: this part is extremely fragile as it depends on the placement
+  // of the PLT section and its proximity to old or new .text.
+  // FIXME: a slightly better approach would be to always use thunks for PLT
+  // and eliminate redirection later using final addresses in address maps.
+  const size_t PLTClusterNum = opts::UseOldText ? Clusters.size() - 1 : 0;
+  auto &PLTCluster = Clusters[PLTClusterNum];
+  for (BinaryFunction &BF : llvm::make_second_range(BC.getBinaryFunctions())) {
+    if (BF.isPLTFunction()) {
+      PLTCluster.Functions.insert(&BF);
+      auto It = PLTCluster.Callees.find(&BF);
+      if (It != PLTCluster.Callees.end())
+        PLTCluster.Callees.erase(It);
+    }
+  }
+
+  // Build thunk functions.
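+  // A "short" thunk is a single direct branch and must itself be within
+  // +/-128MB of its target; a "long" thunk materializes the target address
+  // with an ADRP-based sequence (+/-4GB reach) at the cost of extra
+  // instructions.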
+  size_t NumThunks = 0;
+  auto createShortThunk = [&](BinaryFunction &Callee) {
+    ++NumThunks;
+    BinaryFunction *ThunkBF = BC.createThunkBinaryFunction(
+        "__AArch64Thunk_" + Callee.getOneName().str());
+    MCInst Inst;
+    BC.MIB->createTailCall(Inst, Callee.getSymbol(), BC.Ctx.get());
+    ThunkBF->addBasicBlock()->addInstruction(Inst);
+
+    return ThunkBF;
+  };
+
+  auto createLongThunk = [&](BinaryFunction &Callee) {
+    ++NumThunks;
+    BinaryFunction *ThunkBF = BC.createThunkBinaryFunction(
+        "__AArch64ADRPThunk_" + Callee.getOneName().str());
+    InstructionListType Instructions;
+    BC.MIB->createLongTailCall(Instructions, Callee.getSymbol(), BC.Ctx.get());
+    ThunkBF->addBasicBlock()->addInstructions(Instructions);
+
+    return ThunkBF;
+  };
+
+  for (unsigned ClusterNum = 0; ClusterNum < Clusters.size(); ++ClusterNum) {
+    FunctionCluster &FC = Clusters[ClusterNum];
+    SmallVector<BinaryFunction *> Callees(FC.Callees.begin(),
+                                          FC.Callees.end());
+    llvm::sort(Callees, compareBinaryFunctionByIndex);
+
+    // Return index of adjacent cluster containing the function.
+    auto getAdjClusterWithFunction =
+        [&](const BinaryFunction *BF) -> std::optional<unsigned> {
+      if (ClusterNum > 0 && Clusters[ClusterNum - 1].Functions.count(BF))
+        return ClusterNum - 1;
+      if (ClusterNum + 1 < Clusters.size() &&
+          Clusters[ClusterNum + 1].Functions.count(BF))
+        return ClusterNum + 1;
+      return std::nullopt;
+    };
+
+    const FunctionCluster *PrevCluster =
+        ClusterNum ? &Clusters[ClusterNum - 1] : nullptr;
+
+    // Create short thunks for callees in adjacent clusters and long thunks
+    // for callees outside.
+    for (BinaryFunction *Callee : Callees) {
+      if (FC.Thunks.count(Callee))
+        continue;
+
+      BinaryFunction *Thunk = nullptr;
+      std::optional<unsigned> AdjCluster = getAdjClusterWithFunction(Callee);
+      if (AdjCluster) {
+        Thunk = createShortThunk(*Callee);
+      } else {
+        // Previous cluster may already have a long thunk that can be reused.
+        if (PrevCluster) {
+          auto It = PrevCluster->Thunks.find(Callee);
+          // Reuse only if the previous cluster hosts this thunk.
+          if (It != PrevCluster->Thunks.end() &&
+              llvm::is_contained(PrevCluster->ThunkList, It->first)) {
+            FC.Thunks[Callee] = It->first;
+            continue;
+          }
+        }
+        Thunk = createLongThunk(*Callee);
+      }
+
+      // The cluster that will host this thunk. If the current cluster is the
+      // last one, try to use the previous one. This matters when we want hot
+      // functions at higher addresses under HotFunctionsAtEnd.
+      FunctionCluster *ThunkCluster = &Clusters[ClusterNum];
+      if ((AdjCluster && *AdjCluster == ClusterNum - 1) ||
+          (ClusterNum && ClusterNum == Clusters.size() - 1))
+        ThunkCluster = &Clusters[ClusterNum - 1];
+      ThunkCluster->ThunkList.push_back(Thunk);
+      FC.Thunks[Callee] = Thunk;
+    }
+  }
+
+  BC.outs() << "BOLT-INFO: " << NumThunks << " thunks created\n";
+
+  // Replace callees with thunks.
+  for (FunctionCluster &FC : Clusters) {
+    for (BinaryFunction *BF : FC.Functions) {
+      if (!CallMap.count(BF))
+        continue;
+
+      for (BinaryBasicBlock &BB : *BF) {
+        for (MCInst &Inst : BB) {
+          if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) ||
+              BC.MIB->isIndirectBranch(Inst))
+            continue;
+          const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst);
+          assert(TargetSymbol);
+
+          BinaryFunction *Callee = BC.getFunctionForSymbol(TargetSymbol);
+          if (!Callee) {
+            // Ignore internal calls that use basic block labels as a
+            // destination.
+            continue;
+          }
+
+          // Skip if the callee is in the same cluster.
+          if (!FC.Callees.count(Callee))
+            continue;
+
+          // Use thunk as the call destination.
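+          // Every callee outside this cluster was assigned a thunk in the
+          // loop above, so the map lookup below always succeeds.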
+          BC.MIB->replaceBranchTarget(Inst, FC.Thunks[Callee]->getSymbol(),
+                                      BC.Ctx.get());
+        }
+      }
+    }
+  }
+
+  // Add thunks to the function list and assign a section name matching the
+  // function they follow.
+  for (const FunctionCluster &FC : llvm::reverse(Clusters)) {
+    std::string SectionName =
+        OutputFunctions[FC.LastFunctionIndex]->getCodeSectionName().str();
+    for (BinaryFunction *Thunk : FC.ThunkList) {
+      Thunk->setCodeSectionName(SectionName);
+    }
+
+    OutputFunctions.insert(
+        std::next(OutputFunctions.begin(), FC.LastFunctionIndex + 1),
+        FC.ThunkList.begin(), FC.ThunkList.end());
+  }
+  BC.updateOutputFunctions(OutputFunctions);
+
+  LLVM_DEBUG(dbgs() << "\nFunction layout with thunks:\n";
+             for (const auto *BF : OutputFunctions) { dbgs() << *BF << '\n'; });
+}
+
 Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
-  if (opts::CompactCodeModel) {
+  if (opts::CompactCodeModel || opts::ExperimentalRelaxation) {
     BC.outs()
         << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";

+    // TODO: set correct code model based on the total size of split code.
     ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
       relaxLocalBranches(BF);
     };
@@ -912,11 +1167,17 @@ Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
         BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
         SkipPredicate, "RelaxLocalBranches");

+    if (!opts::ExperimentalRelaxation)
+      return Error::success();
+
+    BC.outs() << "BOLT-INFO: starting experimental relaxation pass\n";
+    relaxCalls(BC);
+
     return Error::success();
   }

   BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
-  std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
+  std::vector<BinaryFunction *> Sorted = BC.getOutputFunctions();
   bool Modified;
   uint32_t Iterations = 0;
   do {

diff --git a/bolt/lib/Passes/PatchEntries.cpp b/bolt/lib/Passes/PatchEntries.cpp
index 55f7513615e7d..aae6634c60646 100644
--- a/bolt/lib/Passes/PatchEntries.cpp
+++ b/bolt/lib/Passes/PatchEntries.cpp
@@ -36,17 +36,21 @@ Error PatchEntries::runOnFunctions(BinaryContext &BC) {
   if (!opts::ForcePatch) {
     // Mark the binary for patching if we did not create external references
     // for original code in any of functions we are not going to emit.
-    bool NeedsPatching = llvm::any_of(
-        llvm::make_second_range(BC.getBinaryFunctions()),
-        [&](BinaryFunction &BF) {
-          return (!BC.shouldEmit(BF) && !BF.hasExternalRefRelocations()) ||
-                 BF.needsPatch();
-        });
+    bool NeedsPatching =
+        llvm::any_of(llvm::make_second_range(BC.getBinaryFunctions()),
+                     [&](BinaryFunction &BF) {
+                       return (!BF.isPseudo() && !BC.shouldEmit(BF) &&
+                               !BF.hasExternalRefRelocations()) ||
+                              BF.needsPatch();
+                     });

     if (!NeedsPatching)
       return Error::success();
   }

+  assert(!opts::UseOldText &&
+         "Cannot patch entries while overwriting original .text");
+
   if (opts::Verbosity >= 1)
     BC.outs() << "BOLT-INFO: patching entries in original code\n";

diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index b21401e069bfa..61fb4b9e68f6e 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -229,7 +229,7 @@ struct SplitCacheDirected final : public SplitStrategy {
   }

   void initializeAuxiliaryVariables() {
-    for (BinaryFunction *BF : BC.getSortedFunctions()) {
+    for (BinaryFunction *BF : BC.getOutputFunctions()) {
       if (!shouldConsiderForCallGraph(*BF))
         continue;

@@ -257,7 +257,7 @@ struct SplitCacheDirected final : public SplitStrategy {
   void buildCallGraph() {
     Callers.resize(TotalNumBlocks);
     Callees.resize(TotalNumBlocks);
-    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
+    for (const BinaryFunction *SrcFunction : BC.getOutputFunctions()) {
       if (!shouldConsiderForCallGraph(*SrcFunction))
         continue;

@@ -360,7 +360,7 @@ struct SplitCacheDirected final : public SplitStrategy {
     const BinaryBasicBlock *ThisBB = &(ThisBF->front());
     const size_t ThisGI = GlobalIndices[ThisBB];

-    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
+    for (const BinaryFunction *DstBF : BC.getOutputFunctions()) {
       if (!shouldConsiderForCallGraph(*DstBF))
         continue;

diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 905728de8262f..e236cbab8fc87 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -2321,12 +2321,11 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
       const BoltAddressTranslation::BBHashMapTy &BlockMap =
           BAT->getBBHashMap(FuncAddress);
       YamlBF.Blocks.resize(YamlBF.NumBasicBlocks);
+      for (size_t Idx = 0; Idx != YamlBF.NumBasicBlocks; ++Idx)
+        YamlBF.Blocks[Idx].Index = Idx;

-      for (auto &&[Entry, YamlBB] : llvm::zip(BlockMap, YamlBF.Blocks)) {
-        const auto &Block = Entry.second;
-        YamlBB.Hash = Block.Hash;
-        YamlBB.Index = Block.Index;
-      }
+      for (const auto &Block : llvm::make_second_range(BlockMap))
+        YamlBF.Blocks[Block.Index].Hash = Block.Hash;

       // Lookup containing basic block offset and index
       auto getBlock = [&BlockMap](uint32_t Offset) {

diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 996d2e972599d..80978f4529f1f 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -501,6 +501,9 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   if (BC.HasRelocations)
     Manager.registerPass(std::make_unique<PatchEntries>());

+  // Assign each function an output section.
+  Manager.registerPass(std::make_unique<AssignSections>());
+
   if (BC.isAArch64()) {
     Manager.registerPass(
         std::make_unique<ADRRelaxationPass>(PrintAdrRelaxation));
@@ -525,9 +528,6 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
   Manager.registerPass(
       std::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));

-  // Assign each function an output section.
-  Manager.registerPass(std::make_unique<AssignSections>());
-
   // This pass turns tail calls into jumps which makes them invisible to
   // function reordering. It's unsafe to use any CFG or instruction analysis
   // after this point.

diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 96045a916232c..7fbc933cade5b 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -2515,6 +2515,13 @@ void RewriteInstance::readDynamicRelocations(const SectionRef &Section,
       exit(1);
     }

+    // Workaround for AArch64 issue with hot text.
+    if (BC->isAArch64() &&
+        (SymbolName == "__hot_start" || SymbolName == "__hot_end")) {
+      BC->addRelocation(Rel.getOffset(), Symbol, ELF::R_AARCH64_ABS64, Addend);
+      continue;
+    }
+
     BC->addDynamicRelocation(Rel.getOffset(), Symbol, RType, Addend);
   }
 }
@@ -3147,6 +3154,11 @@ void RewriteInstance::selectFunctionsToProcess() {
     if (mustSkip(Function))
       return false;

+    // Include veneer functions as we want to replace veneer calls with
+    // direct ones.
+    if (BC->isAArch64() && Function.getOneName().starts_with("__AArch64"))
+      return true;
+
     // If the list is not empty, only process functions from the list.
     if (!opts::ForceFunctionNames.empty() || !ForceFunctionsNR.empty()) {
       // Regex check (-funcs and -funcs-file options).
@@ -3748,7 +3760,7 @@ void RewriteInstance::emitAndLink() {

   if (opts::PrintCacheMetrics) {
     BC->outs() << "BOLT-INFO: cache metrics after emitting functions:\n";
-    CacheMetrics::printAll(BC->outs(), BC->getSortedFunctions());
+    CacheMetrics::printAll(BC->outs(), BC->getAllBinaryFunctions());
   }
 }

diff --git a/bolt/test/AArch64/veneer-bolt.s b/bolt/test/AArch64/veneer-bolt.s
new file mode 100644
index 0000000000000..59759b6a9171e
--- /dev/null
+++ b/bolt/test/AArch64/veneer-bolt.s
@@ -0,0 +1,66 @@
+## Check that llvm-bolt handles code larger than 256MB.
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -nostdlib -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --relax-exp --lite=0
+# RUN: llvm-bolt %t.exe -o %t.bolt --relax-exp --lite=1 --data %t.fdata \
+# RUN:   --print-normalized 2>&1 | FileCheck %s --check-prefix=CHECK-VENEER
+# RUN: llvm-bolt %t.exe -o %t.bolt --relax-exp --hot-functions-at-end --lite=0 \
+# RUN:   --data %t.fdata
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s
+
+## A constant island at the end of each large function makes it ~112MB in
+## size. Thus the total code size exceeds 300MB.
+
+  .text
+  .global foo
+  .type foo, %function
+foo:
+  bl _start
+  bl bar
+  ret
+  .space 0x7000000
+  .size foo, .-foo
+
+  .global bar
+  .type bar, %function
+bar:
+  bl foo
+  bl _start
+  ret
+  .space 0x7000000
+  .size bar, .-bar
+
+  .global hot
+  .type hot, %function
+hot:
+# FDATA: 0 [unknown] 0 1 hot 0 0 100
+  bl foo
+  bl bar
+  bl _start
+  ret
+  .size hot, .-hot
+
+## Check that BOLT sees the call to foo, not to its veneer.
+# CHECK-VENEER-LABEL: Binary Function "hot"
+# CHECK-VENEER: bl
+# CHECK-VENEER-SAME: {{[[:space:]]foo[[:space:]]}}
+
+## Check that BOLT-introduced veneers have proper names.
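+## A short-range veneer (a single direct branch) is named
+## __AArch64Thunk_<target>, while a long-range veneer that materializes the
+## target address via ADRP is named __AArch64ADRPThunk_<target>; its body
+## looks roughly like (sketch, exact sequence depends on the MIB):
+##   adrp x16, target
+##   add  x16, x16, :lo12:target
+##   br   x16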
+# CHECK-LABEL: : +# CHECK-NEXT: bl {{.*}} <__AArch64ADRPThunk_foo> +# CHECK-NEXT: bl {{.*}} <__AArch64Thunk_bar> +# CHECK-NEXT: bl {{.*}} <_start> + + .global _start + .type _start, %function +_start: + bl foo + bl bar + bl hot + ret + .space 0x7000000 + .size _start, .-_start + diff --git a/bolt/test/X86/cdsplit-call-scale.s b/bolt/test/X86/cdsplit-call-scale.s index 66f30036de8c1..b2e7ba1f1ccee 100644 --- a/bolt/test/X86/cdsplit-call-scale.s +++ b/bolt/test/X86/cdsplit-call-scale.s @@ -20,7 +20,7 @@ # RUN: 2>&1 | FileCheck --check-prefix=MEDINCENTIVE %s # RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \ # RUN: --call-scale=1000.0 --print-split --print-only=chain \ -# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \ +# RUN: --data=%t.fdata --reorder-blocks=ext-tsp --force-patch \ # RUN: 2>&1 | FileCheck --check-prefix=HIGHINCENTIVE %s # LOWINCENTIVE: Binary Function "chain" after split-functions diff --git a/clang-tools-extra/README.txt b/clang-tools-extra/README.txt index 6891e4078997f..1195db9b468dd 100644 --- a/clang-tools-extra/README.txt +++ b/clang-tools-extra/README.txt @@ -8,12 +8,13 @@ Clang frontend. These tools are kept in a separate "extra" repository to allow lighter weight checkouts of the core Clang codebase. All discussion regarding Clang, Clang-based tools, and code in this repository -should be held using the standard Clang forum: +should be held using the standard Clang forums: https://discourse.llvm.org/c/clang + https://discourse.llvm.org/c/clang/clang-tidy/71 + https://discourse.llvm.org/c/clang/clangd/34 -Code review for this tree should take place on the standard Clang patch and -commit lists: - http://lists.llvm.org/mailman/listinfo/cfe-commits +Code review for this tree should take place on Github: + https://github.com/llvm/llvm-project/pulls?q=label%3Aclang-tools-extra If you find a bug in these tools, please file it in the LLVM bug tracker: https://github.com/llvm/llvm-project/issues/ diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index 6fdc7196e9095..cc4c68346ec53 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -45,7 +45,7 @@ static auto SerializeReferenceLambda = [](const auto &Ref, Object &Object) { static json::Object serializeLocation(const Location &Loc, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { Object LocationObj = Object(); LocationObj["LineNumber"] = Loc.StartLineNumber; LocationObj["Filename"] = Loc.Filename; @@ -169,7 +169,7 @@ static json::Value serializeComment(const CommentInfo &I) { static void serializeCommonAttributes(const Info &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { Obj["Name"] = I.Name; Obj["USR"] = toHex(toStringRef(I.USR)); @@ -211,9 +211,9 @@ static void serializeReference(const Reference &Ref, Object &ReferenceObj) { // differently. Only enums, records, and typedefs are handled here. 
static void serializeCommonChildren(const ScopeChildren &Children, json::Object &Obj, - const std::optional &RepositoryUrl) { - static auto SerializeInfo = [&RepositoryUrl](const auto &Info, - Object &Object) { + const std::optional RepositoryUrl) { + static auto SerializeInfo = [RepositoryUrl](const auto &Info, + Object &Object) { serializeInfo(Info, Object, RepositoryUrl); }; @@ -304,7 +304,7 @@ static void serializeInfo(const FieldTypeInfo &I, Object &Obj) { } static void serializeInfo(const FunctionInfo &F, json::Object &Obj, - const std::optional &RepositoryURL) { + const std::optional RepositoryURL) { serializeCommonAttributes(F, Obj, RepositoryURL); Obj["IsStatic"] = F.IsStatic; @@ -459,7 +459,7 @@ static void serializeInfo(const RecordInfo &I, json::Object &Obj, } static void serializeInfo(const VarInfo &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { serializeCommonAttributes(I, Obj, RepositoryUrl); Obj["IsStatic"] = I.IsStatic; auto TypeObj = Object(); @@ -468,15 +468,15 @@ static void serializeInfo(const VarInfo &I, json::Object &Obj, } static void serializeInfo(const NamespaceInfo &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { serializeCommonAttributes(I, Obj, RepositoryUrl); if (!I.Children.Namespaces.empty()) serializeArray(I.Children.Namespaces, Obj, "Namespaces", SerializeReferenceLambda); - static auto SerializeInfo = [&RepositoryUrl](const auto &Info, - Object &Object) { + static auto SerializeInfo = [RepositoryUrl](const auto &Info, + Object &Object) { serializeInfo(Info, Object, RepositoryUrl); }; diff --git a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp index 33642c407a3a9..bfa2ab51a6d03 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp @@ -45,7 +45,7 @@ static void replaceMoveWithForward(const UnresolvedLookupExpr *Callee, // We still conservatively put a "std::" in front of the forward because // we don't know whether the code also had a "using std::forward;". Diag << FixItHint::CreateReplacement(CallRange, "std::" + ForwardName); - } else if (const NamespaceDecl *Namespace = NNS->getAsNamespace()) { + } else if (const NamespaceBaseDecl *Namespace = NNS->getAsNamespace()) { if (Namespace->getName() == "std") { if (!NNS->getPrefix()) { // Called as "std::move". 
diff --git a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp
index 2dfaca19a8981..86992cd8a141b 100644
--- a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp
@@ -36,7 +36,8 @@ void UnusedAliasDeclsCheck::check(const MatchFinder::MatchResult &Result) {
   if (const auto *NestedName =
           Result.Nodes.getNodeAs<NestedNameSpecifier>("nns")) {
-    if (const auto *AliasDecl = NestedName->getAsNamespaceAlias()) {
+    if (const auto *AliasDecl = dyn_cast_if_present<NamespaceAliasDecl>(
+            NestedName->getAsNamespace())) {
       FoundDecls[AliasDecl] = CharSourceRange();
     }
   }
diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
index 6cf38ddf3d914..dd28806e008ed 100644
--- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
@@ -282,7 +282,8 @@ class RenamerClangTidyVisitor
   bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc Loc) {
     if (const NestedNameSpecifier *Spec = Loc.getNestedNameSpecifier()) {
-      if (const NamespaceDecl *Decl = Spec->getAsNamespace())
+      if (const auto *Decl =
+              dyn_cast_if_present<NamespaceDecl>(Spec->getAsNamespace()))
         Check->addUsage(Decl, Loc.getLocalSourceRange(), SM);
     }
 
diff --git a/clang-tools-extra/clangd/AST.cpp b/clang-tools-extra/clangd/AST.cpp
index e274236527817..f2631e5abb6a3 100644
--- a/clang-tools-extra/clangd/AST.cpp
+++ b/clang-tools-extra/clangd/AST.cpp
@@ -666,12 +666,14 @@ std::string getQualification(ASTContext &Context,
   return getQualification(
       Context, DestContext, ND->getDeclContext(),
       [&](NestedNameSpecifier *NNS) {
-        if (NNS->getKind() != NestedNameSpecifier::Namespace)
+        const NamespaceDecl *NS =
+            dyn_cast_if_present<NamespaceDecl>(NNS->getAsNamespace());
+        if (!NS)
           return false;
-        const auto *CanonNSD = NNS->getAsNamespace()->getCanonicalDecl();
+        NS = NS->getCanonicalDecl();
         return llvm::any_of(VisibleNamespaceDecls,
-                            [CanonNSD](const NamespaceDecl *NSD) {
-                              return NSD->getCanonicalDecl() == CanonNSD;
+                            [NS](const NamespaceDecl *NSD) {
+                              return NSD->getCanonicalDecl() == NS;
                             });
       });
 }
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index d5907e3143bf6..184c3c962f063 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -1470,7 +1470,6 @@ bool allowIndex(CodeCompletionContext &CC) {
   switch (NameSpec->getKind()) {
   case NestedNameSpecifier::Global:
   case NestedNameSpecifier::Namespace:
-  case NestedNameSpecifier::NamespaceAlias:
     return true;
   case NestedNameSpecifier::Super:
   case NestedNameSpecifier::TypeSpec:
diff --git a/clang-tools-extra/clangd/DumpAST.cpp b/clang-tools-extra/clangd/DumpAST.cpp
index 8f24477ecd3de..c6075e75e9a6b 100644
--- a/clang-tools-extra/clangd/DumpAST.cpp
+++ b/clang-tools-extra/clangd/DumpAST.cpp
@@ -158,7 +158,6 @@ class DumpVisitor : public RecursiveASTVisitor<DumpVisitor> {
       NNS_KIND(TypeSpec);
       NNS_KIND(Global);
       NNS_KIND(Super);
-      NNS_KIND(NamespaceAlias);
 #undef NNS_KIND
     }
     llvm_unreachable("Unhandled SpecifierKind enum");
@@ -281,8 +280,6 @@ class DumpVisitor : public RecursiveASTVisitor<DumpVisitor> {
       return NNS.getAsIdentifier()->getName().str() + "::";
     case NestedNameSpecifier::Namespace:
       return NNS.getAsNamespace()->getNameAsString() + "::";
-    case NestedNameSpecifier::NamespaceAlias:
-      return NNS.getAsNamespaceAlias()->getNameAsString() + "::";
     default:
       return "";
     }
diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp
index 91fd3b0f8567b..b1089577ba819 100644
--- a/clang-tools-extra/clangd/FindTarget.cpp
+++ b/clang-tools-extra/clangd/FindTarget.cpp
@@ -491,9 +491,6 @@ struct TargetFinder {
       case NestedNameSpecifier::Namespace:
         add(NNS->getAsNamespace(), Flags);
         return;
-      case NestedNameSpecifier::NamespaceAlias:
-        add(NNS->getAsNamespaceAlias(), Flags);
-        return;
       case NestedNameSpecifier::Identifier:
         if (Resolver) {
           add(Resolver->resolveNestedNameSpecifierToType(NNS), Flags);
diff --git a/clang-tools-extra/clangd/IncludeFixer.cpp b/clang-tools-extra/clangd/IncludeFixer.cpp
index 4ff021c4c390a..50bc2bd7ccb94 100644
--- a/clang-tools-extra/clangd/IncludeFixer.cpp
+++ b/clang-tools-extra/clangd/IncludeFixer.cpp
@@ -403,25 +403,27 @@ std::optional<CheapUnresolvedName> extractUnresolvedNameCheaply(
   if (auto *Nested = SS->getScopeRep()) {
     if (Nested->getKind() == NestedNameSpecifier::Global) {
       Result.ResolvedScope = "";
-    } else if (const auto *NS = Nested->getAsNamespace()) {
-      std::string SpecifiedNS = printNamespaceScope(*NS);
-      std::optional<std::string> Spelling = getSpelledSpecifier(*SS, SM);
-
-      // Check the specifier spelled in the source.
-      // If the resolved scope doesn't end with the spelled scope, the
-      // resolved scope may come from a sema typo correction. For example,
-      // sema assumes that "clangd::" is a typo of "clang::" and uses
-      // "clang::" as the specified scope in:
-      //     namespace clang { clangd::X; }
-      // In this case, we use the "typo" specifier as extra scope instead
-      // of using the scope assumed by sema.
-      if (!Spelling || llvm::StringRef(SpecifiedNS).ends_with(*Spelling)) {
-        Result.ResolvedScope = std::move(SpecifiedNS);
+    } else if (const NamespaceBaseDecl *NSB = Nested->getAsNamespace()) {
+      if (const auto *NS = dyn_cast<NamespaceDecl>(NSB)) {
+        std::string SpecifiedNS = printNamespaceScope(*NS);
+        std::optional<std::string> Spelling = getSpelledSpecifier(*SS, SM);
+
+        // Check the specifier spelled in the source.
+        // If the resolved scope doesn't end with the spelled scope, the
+        // resolved scope may come from a sema typo correction. For example,
+        // sema assumes that "clangd::" is a typo of "clang::" and uses
+        // "clang::" as the specified scope in:
+        //     namespace clang { clangd::X; }
+        // In this case, we use the "typo" specifier as extra scope instead
+        // of using the scope assumed by sema.
+        if (!Spelling || llvm::StringRef(SpecifiedNS).ends_with(*Spelling)) {
+          Result.ResolvedScope = std::move(SpecifiedNS);
+        } else {
+          Result.UnresolvedScope = std::move(*Spelling);
+        }
       } else {
-        Result.UnresolvedScope = std::move(*Spelling);
+        Result.ResolvedScope =
+            printNamespaceScope(*cast<NamespaceAliasDecl>(NSB)->getNamespace());
       }
-    } else if (const auto *ANS = Nested->getAsNamespaceAlias()) {
-      Result.ResolvedScope = printNamespaceScope(*ANS->getNamespace());
     } else {
       // We don't fix symbols in scopes that are not top-level e.g. class
       // members, as we don't collect includes for them.
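For readers following the ``getAsNamespace()`` migration in the hunks above, here is a minimal sketch of the new caller-side pattern (an editorial illustration, not part of the patch; the helper name ``getUnderlyingNamespace`` is hypothetical):

.. code-block:: c++

  // Sketch: getAsNamespace() now returns NamespaceBaseDecl*, which is either
  // a NamespaceDecl or a NamespaceAliasDecl (or null for other specifiers).
  #include "clang/AST/Decl.h"
  #include "clang/AST/DeclCXX.h"
  #include "clang/AST/NestedNameSpecifier.h"
  #include "llvm/Support/Casting.h"

  using namespace clang;

  static const NamespaceDecl *
  getUnderlyingNamespace(const NestedNameSpecifier *NNS) {
    // Callers that only want real namespaces filter with dyn_cast_if_present,
    // which tolerates the null returned for non-namespace specifiers.
    if (const auto *NS =
            llvm::dyn_cast_if_present<NamespaceDecl>(NNS->getAsNamespace()))
      return NS;
    // Aliases can be resolved explicitly; the patch's
    // NamespaceBaseDecl::getNamespace() performs this resolution for both cases.
    if (const auto *Alias = llvm::dyn_cast_if_present<NamespaceAliasDecl>(
            NNS->getAsNamespace()))
      return Alias->getNamespace();
    return nullptr; // identifier, type, global, or super specifier
  }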
diff --git a/clang-tools-extra/clangd/ModulesBuilder.cpp b/clang-tools-extra/clangd/ModulesBuilder.cpp
index 6658111d6c7b4..56d00c22674e5 100644
--- a/clang-tools-extra/clangd/ModulesBuilder.cpp
+++ b/clang-tools-extra/clangd/ModulesBuilder.cpp
@@ -219,8 +219,9 @@ bool IsModuleFileUpToDate(PathRef ModuleFilePath,
   IntrusiveRefCntPtr<ModuleCache> ModCache = createCrossProcessModuleCache();
   PCHContainerOperations PCHOperations;
+  CodeGenOptions CodeGenOpts;
   ASTReader Reader(PP, *ModCache, /*ASTContext=*/nullptr,
-                   PCHOperations.getRawReader(), {});
+                   PCHOperations.getRawReader(), CodeGenOpts, {});
 
   // We don't need any listener here. By default it will use a validator
   // listener.
diff --git a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp
index 00c05ebdb5216..67fc451a6a1a1 100644
--- a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp
+++ b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp
@@ -173,7 +173,8 @@ findInsertionPoint(const Tweak::Selection &Inputs,
     if (SM.isBeforeInTranslationUnit(Inputs.Cursor, U->getUsingLoc()))
       // "Usings" is sorted, so we're done.
       break;
-    if (const auto *Namespace = U->getQualifier()->getAsNamespace()) {
+    if (const auto *Namespace = dyn_cast_if_present<NamespaceDecl>(
+            U->getQualifier()->getAsNamespace())) {
       if (Namespace->getCanonicalDecl() ==
           QualifierToRemove.getNestedNameSpecifier()
               ->getAsNamespace()
@@ -232,7 +233,10 @@ findInsertionPoint(const Tweak::Selection &Inputs,
 
 bool isNamespaceForbidden(const Tweak::Selection &Inputs,
                           const NestedNameSpecifier &Namespace) {
-  std::string NamespaceStr = printNamespaceScope(*Namespace.getAsNamespace());
+  const auto *NS = dyn_cast<NamespaceDecl>(Namespace.getAsNamespace());
+  if (!NS)
+    return true;
+  std::string NamespaceStr = printNamespaceScope(*NS);
 
   for (StringRef Banned : Config::current().Style.FullyQualifiedNamespaces) {
     StringRef PrefixMatch = NamespaceStr;
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index cf97ab7082472..07ebf8008928d 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -58,9 +58,6 @@ Semantic Highlighting
 Compile flags
 ^^^^^^^^^^^^^
 
-- Added `BuiltinHeaders` config key which controls whether clangd's built-in
-  headers are used or ones extracted from the driver.
-
 Hover
 ^^^^^
 
@@ -88,301 +85,23 @@ Improvements to clang-doc
 Improvements to clang-query
 ---------------------------
 
-Improvements to include-cleaner
--------------------------------
-- Deprecated the ``-insert`` and ``-remove`` command line options, and added
-  the ``-disable-remove`` and ``-disable-insert`` command line options as
-  replacements. The previous command line options were confusing because they
-  did not imply the default state of the option (which is inserts and removes
-  being enabled). The new options are easier to understand the semantics of.
+- Matcher queries interpreted by clang-query now support a trailing comma (,)
+  in matcher arguments. Note that C++ still doesn't allow this in function
+  arguments, so when porting a query to C++, remove all instances of the
+  trailing comma (otherwise the C++ compiler will complain about an "expected
+  expression").
 
 Improvements to clang-tidy
 --------------------------
 
-- Changed the :program:`check_clang_tidy.py` tool to use FileCheck's
-  ``--match-full-lines`` instead of ``strict-whitespace`` for ``CHECK-FIXES``
-  clauses. Added a ``--match-partial-fixes`` option to keep previous behavior on
-  specific tests.
  This may break tests for users with custom out-of-tree checks
-  who use :program:`check_clang_tidy.py` as-is.
-
-- Improved :program:`clang-tidy-diff.py` script. Added the `-warnings-as-errors`
-  argument to treat warnings as errors.
-
-- Improved :program:`clang-tidy` to show `CheckOptions` only for checks enabled
-  in `Checks` when running ``--dump-config``.
-
-- Fixed a bug in :program:`clang-tidy` by which `HeaderFilterRegex` did not take
-  effect when passed via the `.clang-tidy` file.
-
-- Fixed a bug in :program:`run_clang_tidy.py` where the program would not
-  correctly display the checks enabled by the top-level `.clang-tidy` file.
-
 New checks
 ^^^^^^^^^^
 
-- New :doc:`bugprone-capturing-this-in-member-variable
-  <clang-tidy/checks/bugprone/capturing-this-in-member-variable>` check.
-
-  Finds lambda captures and ``bind`` function calls that capture the ``this``
-  pointer and store it as class members without handling the copy and move
-  constructors and the assignments.
-
-- New :doc:`bugprone-misleading-setter-of-reference
-  <clang-tidy/checks/bugprone/misleading-setter-of-reference>` check.
-
-  Finds setter-like member functions that take a pointer parameter and set a
-  reference member of the same class with the pointed value.
-
-- New :doc:`bugprone-unintended-char-ostream-output
-  <clang-tidy/checks/bugprone/unintended-char-ostream-output>` check.
-
-  Finds unintended character output from ``unsigned char`` and ``signed char``
-  to an ``ostream``.
-
-- New :doc:`cppcoreguidelines-use-enum-class
-  <clang-tidy/checks/cppcoreguidelines/use-enum-class>` check.
-
-  Finds unscoped (non-class) ``enum`` declarations and suggests using
-  ``enum class`` instead.
-
-- New :doc:`llvm-prefer-static-over-anonymous-namespace
-  <clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace>` check.
-
-  Finds function and variable declarations inside an anonymous namespace and
-  suggests replacing them with ``static`` declarations.
-
-- New :doc:`modernize-use-scoped-lock
-  <clang-tidy/checks/modernize/use-scoped-lock>` check.
-
-  Finds uses of ``std::lock_guard`` and suggests replacing them with C++17's
-  alternative ``std::scoped_lock``.
-
-- New :doc:`portability-avoid-pragma-once
-  <clang-tidy/checks/portability/avoid-pragma-once>` check.
-
-  Finds uses of ``#pragma once`` and suggests replacing them with standard
-  include guards (``#ifndef``/``#define``/``#endif``) for improved portability.
-
-- New :doc:`readability-ambiguous-smartptr-reset-call
-  <clang-tidy/checks/readability/ambiguous-smartptr-reset-call>` check.
-
-  Finds potentially erroneous calls to the ``reset`` method on smart pointers
-  when the pointee type also has a ``reset`` method.
-
-- New :doc:`readability-use-concise-preprocessor-directives
-  <clang-tidy/checks/readability/use-concise-preprocessor-directives>` check.
-
-  Finds uses of ``#if`` that can be simplified to ``#ifdef`` or ``#ifndef`` and,
-  since C23 and C++23, uses of ``#elif`` that can be simplified to ``#elifdef``
-  or ``#elifndef``.
-
 New check aliases
 ^^^^^^^^^^^^^^^^^
 
 Changes in existing checks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- Improved :doc:`bugprone-crtp-constructor-accessibility
-  <clang-tidy/checks/bugprone/crtp-constructor-accessibility>` check by fixing
-  false positives on deleted constructors that cannot be used to construct
-  objects, even if they have public or protected access.
-
-- Improved :doc:`bugprone-exception-escape
-  <clang-tidy/checks/bugprone/exception-escape>` check to print the stack trace
-  of a potentially escaped exception.
-
-- Added an option to :doc:`bugprone-multi-level-implicit-pointer-conversion
-  <clang-tidy/checks/bugprone/multi-level-implicit-pointer-conversion>` to
-  choose whether to enable the check in C code or not.
-
-- Improved :doc:`bugprone-optional-value-conversion
-  <clang-tidy/checks/bugprone/optional-value-conversion>` check to detect
-  conversion in the argument of ``std::make_optional``.
-
-- Improved :doc:`bugprone-sizeof-expression
-  <clang-tidy/checks/bugprone/sizeof-expression>` check by adding the
-  `WarnOnSizeOfInLoopTermination` option to detect misuses of the ``sizeof``
-  expression in loop conditions.
-
-- Improved :doc:`bugprone-string-constructor
-  <clang-tidy/checks/bugprone/string-constructor>` check to find suspicious
-  calls of the ``std::string`` constructor with a char pointer, start position,
-  and length parameters.
-
-- Improved :doc:`bugprone-unchecked-optional-access
-  <clang-tidy/checks/bugprone/unchecked-optional-access>` check, fixing false
-  positives from smart pointer accessors repeated in checking ``has_value``
-  and accessing ``value``. The option `IgnoreSmartPointerDereference` should
-  no longer be needed and will be removed. Also fixed false positives from
-  const reference accessors to objects containing an optional member.
-
-- Improved :doc:`bugprone-unsafe-functions
-  <clang-tidy/checks/bugprone/unsafe-functions>` check to allow specifying
-  additional C++ member functions to match.
-
-- Improved :doc:`cert-err33-c
-  <clang-tidy/checks/cert/err33-c>` check by fixing false positives when
-  a function name is just prefixed with a targeted function name.
-
-- Improved :doc:`concurrency-mt-unsafe
-  <clang-tidy/checks/concurrency/mt-unsafe>` check by fixing a false positive
-  where ``strerror`` was flagged as MT-unsafe.
-
-- Improved :doc:`cppcoreguidelines-avoid-goto
-  <clang-tidy/checks/cppcoreguidelines/avoid-goto>` check by adding the option
-  `IgnoreMacros` to ignore ``goto`` labels defined in macros.
-
-- Improved :doc:`cppcoreguidelines-interfaces-global-init
-  <clang-tidy/checks/cppcoreguidelines/interfaces-global-init>` check by
-  fixing false positives on uses of ``constinit`` variables.
-
-- Improved :doc:`cppcoreguidelines-missing-std-forward
-  <clang-tidy/checks/cppcoreguidelines/missing-std-forward>` check by adding a
-  flag to specify the function used for forwarding instead of ``std::forward``.
-
-- Improved :doc:`cppcoreguidelines-pro-bounds-pointer-arithmetic
-  <clang-tidy/checks/cppcoreguidelines/pro-bounds-pointer-arithmetic>` check by
-  fixing false positives when calling indexing operators that do not perform
-  pointer arithmetic in templates, for example ``std::map::operator[]``, and
-  when pointer arithmetic was used through type aliases.
-
-- Improved :doc:`cppcoreguidelines-rvalue-reference-param-not-moved
-  <clang-tidy/checks/cppcoreguidelines/rvalue-reference-param-not-moved>` check
-  by adding a flag to specify the function used for moving instead of
-  ``std::move``.
-
-- Improved :doc:`cppcoreguidelines-special-member-functions
-  <clang-tidy/checks/cppcoreguidelines/special-member-functions>` check by
-  adding the option `IgnoreMacros` to ignore classes defined in macros.
-
-- Improved :doc:`google-readability-namespace-comments
-  <clang-tidy/checks/google/readability-namespace-comments>` check by adding
-  the option `AllowOmittingNamespaceComments` to accept a namespace comment
-  being omitted entirely.
-
-- Improved :doc:`hicpp-avoid-goto
-  <clang-tidy/checks/hicpp/avoid-goto>` check by adding the option
-  `IgnoreMacros` to ignore ``goto`` labels defined in macros.
-
-- Improved :doc:`hicpp-special-member-functions
-  <clang-tidy/checks/hicpp/special-member-functions>` check by adding the
-  option `IgnoreMacros` to ignore classes defined in macros.
-
-- Improved :doc:`llvm-namespace-comment
-  <clang-tidy/checks/llvm/namespace-comment>` check by adding the option
-  `AllowOmittingNamespaceComments` to accept a namespace comment being omitted
-  entirely.
-
-- Improved :doc:`misc-const-correctness
-  <clang-tidy/checks/misc/const-correctness>` check by adding the option
-  `AllowedTypes`, which excludes specified types from const-correctness
-  checking, fixing false positives when modifying a variant via ``operator[]``
-  with templates in the parameters, supporting checks of pointee mutation via
-  the `AnalyzePointers` option, and fixing false positives when using const
-  array types.
-
-- Improved :doc:`misc-include-cleaner
-  <clang-tidy/checks/misc/include-cleaner>` check by adding the options
-  `UnusedIncludes` and `MissingIncludes`, which specify whether the check should
-  report unused or missing includes respectively.
-
-- Improved :doc:`misc-redundant-expression
-  <clang-tidy/checks/misc/redundant-expression>` check by providing additional
-  examples and fixing some macro related false positives.
-
-- Improved :doc:`misc-unconventional-assign-operator
-  <clang-tidy/checks/misc/unconventional-assign-operator>` check by fixing
-  false positives when a copy assignment operator in a template class
-  returns the result of another assignment to ``*this`` (``return *this=...``).
-
-- Improved :doc:`misc-unused-using-decls
-  <clang-tidy/checks/misc/unused-using-decls>` check by fixing false positives
-  on ``operator""`` with template parameters.
-
-- Improved :doc:`misc-use-internal-linkage
-  <clang-tidy/checks/misc/use-internal-linkage>` check by fixing false positives
-  for functions or variables in header files containing macro expansions, and
-  by excluding variables with the ``thread_local`` storage class specifier from
-  being matched.
-
-- Improved :doc:`modernize-pass-by-value
-  <clang-tidy/checks/modernize/pass-by-value>` check by fixing false positives
-  when a class passed by const-reference had a private move constructor.
-
-- Improved :doc:`modernize-type-traits
-  <clang-tidy/checks/modernize/type-traits>` check by detecting more type traits.
-
-- Improved :doc:`modernize-use-default-member-init
-  <clang-tidy/checks/modernize/use-default-member-init>` check by matching
-  arithmetic operations, ``constexpr`` and ``static`` values, and detecting
-  explicit casting of built-in types within member list initialization.
-
-- Improved :doc:`modernize-use-designated-initializers
-  <clang-tidy/checks/modernize/use-designated-initializers>` check by avoiding
-  diagnosing designated initializers for ``std::array`` initializations.
-
-- Improved :doc:`modernize-use-ranges
-  <clang-tidy/checks/modernize/use-ranges>` check by updating the
-  warning-suppression logic for ``nullptr`` in ``std::find``.
-
-- Improved :doc:`modernize-use-starts-ends-with
-  <clang-tidy/checks/modernize/use-starts-ends-with>` check by adding more
-  matched scenarios of the ``find`` and ``rfind`` methods and fixing false
-  positives when those methods were called with 3 arguments.
-
-- Improved :doc:`modernize-use-std-numbers
-  <clang-tidy/checks/modernize/use-std-numbers>` check to support math
-  functions of different precisions.
-
-- Improved :doc:`modernize-use-trailing-return-type
-  <clang-tidy/checks/modernize/use-trailing-return-type>` check by adding
-  support to modernize lambda signatures to use a trailing return type and
-  adding two new options, `TransformFunctions` and `TransformLambdas`, to
-  control whether function declarations and lambdas should be transformed by
-  the check. Fixed false positives when a lambda was matched as a function in
-  C++11 mode.
-
-- Improved :doc:`performance-move-const-arg
-  <clang-tidy/checks/performance/move-const-arg>` check by fixing false
-  negatives on ternary operators calling ``std::move``.
-
-- Improved :doc:`performance-unnecessary-value-param
-  <clang-tidy/checks/performance/unnecessary-value-param>` check performance by
-  tolerating fix-it breaking compilation when functions are used as pointers,
-  to avoid matching usage of functions within the current compilation unit.
-  Added an option `IgnoreCoroutines` with the default value `true` to
-  suppress this check for coroutines where passing by reference may be unsafe.
-
-- Improved :doc:`readability-convert-member-functions-to-static
-  <clang-tidy/checks/readability/convert-member-functions-to-static>` check by
-  fixing false positives on member functions with an explicit object parameter.
-
-- Improved :doc:`readability-function-size
-  <clang-tidy/checks/readability/function-size>` check by adding a new option,
-  `CountMemberInitAsStmt`, that allows counting class member initializers in
-  constructors as statements.
-
-- Improved :doc:`readability-math-missing-parentheses
-  <clang-tidy/checks/readability/math-missing-parentheses>` check by fixing
-  false negatives where math expressions are the operand of assignment operators
-  or comparison operators (see the sketch after this list).
-
-- Improved :doc:`readability-named-parameter
-  <clang-tidy/checks/readability/named-parameter>` check by adding the option
-  `InsertPlainNamesInForwardDecls` to insert parameter names without comments
-  for forward declarations only.
-
-- Improved :doc:`readability-qualified-auto
-  <clang-tidy/checks/readability/qualified-auto>` check by adding the option
-  `AllowedTypes`, which excludes specified types from adding qualifiers.
-
-- Improved :doc:`readability-redundant-inline-specifier
-  <clang-tidy/checks/readability/redundant-inline-specifier>` check by fixing
-  false positives on out-of-line explicitly defaulted functions.
-
-- Improved :doc:`readability-redundant-smartptr-get
-  <clang-tidy/checks/readability/redundant-smartptr-get>` check by fixing
-  some false positives involving smart pointers to arrays.
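An editorial illustration (not part of the release notes) of the improved readability-math-missing-parentheses behavior referenced above; all names are invented:

.. code-block:: c++

  // After the improvement, mixed-precedence math is diagnosed even when it
  // appears as the operand of an assignment or comparison.
  int f(int a, int b, int c) {
    int r = 0;
    r = a + b * c;          // flagged: suggests a + (b * c)
    return r == a + b * c;  // comparison operand is now also checked
  }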
-
 Removed checks
 ^^^^^^^^^^^^^^
 
@@ -409,3 +128,4 @@ Improvements to pp-trace
 
 Clang-tidy Visual Studio plugin
 -------------------------------
+
diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst
index 9611c655886f2..66c0abadc6a40 100644
--- a/clang-tools-extra/docs/clang-tidy/Contributing.rst
+++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst
@@ -657,6 +657,29 @@ directory. The path to this directory is available in a lit test with the varia
 .. _FileCheck: https://llvm.org/docs/CommandGuide/FileCheck.html
 .. _test/clang-tidy/checkers/google/readability-casting.cpp: https://github.com/llvm/llvm-project/blob/main/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp
 
+
+Submitting a Pull Request
+-------------------------
+
+Before submitting a pull request, contributors are encouraged to run
+:program:`clang-tidy` and :program:`clang-format` on their changes to ensure
+code quality and catch potential issues. While :program:`clang-tidy` is not
+currently enforced in CI, following this practice helps maintain code
+consistency and prevent common errors.
+
+Here are useful commands to check your staged changes:
+
+.. code-block:: console
+
+  $ git diff --staged -U0 | ./clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py \
+      -j $(nproc) -path build/ -p1 -only-check-in-db
+  $ git clang-format
+
+Note that some warnings may be false positives or require careful consideration
+before fixing. Use your judgment and feel free to discuss in the pull request
+if you're unsure about a particular warning.
+
+
 Out-of-tree check plugins
 -------------------------
diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp
index baff90faa6eae..49cc13606f4c2 100644
--- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp
+++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp
@@ -140,7 +140,6 @@ class ASTWalker : public RecursiveASTVisitor<ASTWalker> {
     return true;
   switch (Qual->getKind()) {
   case NestedNameSpecifier::Namespace:
-  case NestedNameSpecifier::NamespaceAlias:
   case NestedNameSpecifier::Global:
     return true;
   case NestedNameSpecifier::TypeSpec:
diff --git a/clang-tools-extra/test/clang-query/trailing-comma.c b/clang-tools-extra/test/clang-query/trailing-comma.c
new file mode 100644
index 0000000000000..4fc0cb0f24187
--- /dev/null
+++ b/clang-tools-extra/test/clang-query/trailing-comma.c
@@ -0,0 +1,21 @@
+void foo(void) {}
+// CHECK-OK: trailing-comma.c:1:1: note: "root" binds here
+// CHECK-ERR-COMMA: Invalid token <,> found when looking for a value.
+
+// RUN: clang-query -c "match \
+// RUN:   functionDecl( \
+// RUN:     hasName( \
+// RUN:       \"foo\", \
+// RUN:     ), \
+// RUN:   ) \
+// RUN: " %s | FileCheck --check-prefix=CHECK-OK %s
+
+// Same with \n tokens
+// RUN: echo "match functionDecl( hasName( \"foo\" , ) , )" | sed "s/ /\n/g" >%t
+// RUN: clang-query -f %t %s | FileCheck --check-prefix=CHECK-OK %s
+
+// RUN: not clang-query -c "match functionDecl(hasName(\"foo\"),,)" %s \
+// RUN:   | FileCheck --check-prefix=CHECK-ERR-COMMA %s
+
+// RUN: not clang-query -c "match functionDecl(,)" %s \
+// RUN:   | FileCheck --check-prefix=CHECK-ERR-COMMA %s
diff --git a/clang/docs/CXXTypeAwareAllocators.rst b/clang/docs/CXXTypeAwareAllocators.rst
new file mode 100644
index 0000000000000..4cf5f7817ac3e
--- /dev/null
+++ b/clang/docs/CXXTypeAwareAllocators.rst
@@ -0,0 +1,155 @@
+=========================
+C++ Type Aware Allocators
+=========================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+Clang includes an implementation of P2719 "Type-aware allocation and
+deallocation functions".
+
+This is a feature that extends the semantics of the ``new``, ``new[]``,
+``delete`` and ``delete[]`` operators to expose the type being allocated to the
+operator. This can be used to customize allocation of types without needing to
+modify the type declaration, or, via template definitions, to build fully
+generic type aware allocators.
+
+P2719 introduces a type-identity tag as a valid parameter type for all
+allocation operators. This tag is a default initialized value of type
+``std::type_identity<T>``, where ``T`` is the type being allocated or
+deallocated. Unlike the other placement arguments, this tag is passed as the
+first parameter to the operator.
+
+The most basic use case is as follows:
+
+.. code-block:: c++
+
+  #include <new>
+  #include <type_traits>
+
+  struct S {
+    // ...
+  };
+
+  void *operator new(std::type_identity<S>, size_t, std::align_val_t);
+  void operator delete(std::type_identity<S>, void *, size_t, std::align_val_t);
+
+  void f() {
+    S *s = new S; // calls ::operator new(std::type_identity<S>(), sizeof(S), alignof(S))
+    delete s;     // calls ::operator delete(std::type_identity<S>(), s, sizeof(S), alignof(S))
+  }
+
+While this functionality alone is powerful and useful, the true power comes
+from using templates. In addition to adding the type-identity tag, P2719 allows
+the tag parameter to be a dependent specialization of ``std::type_identity``,
+updates the overload resolution rules to support full template deduction and
+constraint semantics, and updates the definition of usual deallocation functions
+to include ``operator delete`` definitions that are templatized on the
+type-identity tag.
+
+This allows arbitrarily constrained definitions of the operators that resolve
+as would be expected for any other template function resolution, e.g. (only
+showing ``operator new`` for brevity):
+
+.. code-block:: c++
+
+  template <typename T, size_t Size> struct Array {
+    T buffer[Size];
+  };
+
+  // Starting with a concrete type
+  void *operator new(std::type_identity<Array<int, 5>>, size_t, std::align_val_t);
+
+  // Only care about five element arrays
+  template <typename T>
+  void *operator new(std::type_identity<Array<T, 5>>, size_t, std::align_val_t);
+
+  // An array of N floats
+  template <size_t N>
+  void *operator new(std::type_identity<Array<float, N>>, size_t, std::align_val_t);
+
+  // Any array
+  template <typename T, size_t N>
+  void *operator new(std::type_identity<Array<T, N>>, size_t, std::align_val_t);
+
+  // A handy concept
+  template <typename T> concept Polymorphic = std::is_polymorphic_v<T>;
+
+  // Only applies if T is Polymorphic
+  template <Polymorphic T>
+  void *operator new(std::type_identity<T>, size_t, std::align_val_t);
+
+  // Any even length array
+  template <typename T, size_t N>
+  void *operator new(std::type_identity<Array<T, N>>, size_t, std::align_val_t)
+    requires(N%2 == 0);
+
+Operator selection then proceeds according to the usual rules for choosing
+the best/most constrained match.
+
+Any declaration of a type aware operator new or operator delete must include a
+matching complementary operator defined in the same scope.
+
+Notes
+=====
+
+Unconstrained Global Operators
+------------------------------
+
+Declaring an unconstrained type aware global operator ``new`` or ``delete`` (or
+``[]`` variants) creates numerous hazards, similar to, but different from, those
+created by attempting to replace the non-type aware global operators. For that
+reason, unconstrained operators are strongly discouraged.
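To make the pairing requirement concrete, here is a minimal sketch (an editorial illustration, not part of the patch; the ``Arena`` concept and ``arena_tag`` member are invented, and the code assumes Clang's type aware allocator extension is enabled) of a constrained operator pair declared together:

.. code-block:: c++

  #include <cstddef>
  #include <new>
  #include <type_traits>

  // Hypothetical constraint: types opt in by exposing an arena_tag member type.
  template <typename T>
  concept Arena = requires { typename T::arena_tag; };

  // new and delete share the exact same constraint and live in the same scope,
  // so overload resolution can never pick mismatched halves of the pair.
  template <Arena T>
  void *operator new(std::type_identity<T>, std::size_t Size,
                     std::align_val_t Align) {
    return ::operator new(Size, Align); // forward to the global allocator
  }

  template <Arena T>
  void operator delete(std::type_identity<T>, void *P, std::size_t Size,
                       std::align_val_t Align) noexcept {
    ::operator delete(P, Size, Align);
  }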
+
+Mismatching Constraints
+-----------------------
+
+When declaring global type aware operators, you should ensure the constraints
+applied to new and delete match exactly, and declare them together. This
+limits the risk of mismatched operators being selected because differing
+constraints change the prioritization when determining the most viable
+candidate.
+
+Declarations Across Libraries
+-----------------------------
+
+Declaring a typed allocator for a type in a separate TU or library creates
+similar hazards, as different libraries and TUs may see (or select) different
+definitions.
+
+Under this model, something like this would be risky:
+
+.. code-block:: c++
+
+  template <typename T>
+  void *operator new(std::type_identity<std::vector<T>>, size_t, std::align_val_t);
+
+However, this hazard is not present simply due to the use of a type from
+another library:
+
+.. code-block:: c++
+
+  template <typename T>
+  struct MyType {
+    T thing;
+  };
+  template <typename T>
+  void *operator new(std::type_identity<MyType<std::vector<T>>>, size_t, std::align_val_t);
+
+Here we see ``std::vector`` being used, but that is not the actual type being
+allocated.
+
+Implicit and Placement Parameters
+---------------------------------
+
+Type aware allocators are always passed the implicit size and alignment
+parameters. Explicit placement parameters are supported after the
+mandatory implicit parameters.
+
+Publication
+===========
+
+`Type-aware allocation and deallocation functions <https://wg21.link/P2719>`_.
+Louis Dionne, Oliver Hunt.
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 0e21ef0244f78..bfc8094f3f50c 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -6015,6 +6015,16 @@ the configuration (without a prefix: ``Auto``).
        #include "B/A.h"               #include "B/a.h"
        #include "B/a.h"               #include "a/b.h"
 
+  * ``bool IgnoreExtension`` When sorting includes in each block, only take
+    file extensions into account if two includes compare equal otherwise.
+
+    .. code-block:: c++
+
+       true:                    false:
+       # include "A.h"      vs. # include "A-util.h"
+       # include "A.inc"        # include "A.h"
+       # include "A-util.h"     # include "A.inc"
+
 .. _SortJavaStaticImport:
 
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a42a546555716..f448a9a8db172 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -15,6 +15,7 @@ Clang Language Extensions
    AutomaticReferenceCounting
    PointerAuthentication
    MatrixTypes
+   CXXTypeAwareAllocators
 
 Introduction
 ============
@@ -1540,6 +1541,13 @@ Use ``__has_feature(cxx_variable_templates)`` or
 ``__has_extension(cxx_variable_templates)`` to determine if support for
 templated variable declarations is enabled.
 
+C++ type aware allocators
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use ``__has_extension(cxx_type_aware_allocators)`` to determine the existence
+of support for the future C++2d type aware allocator feature. See
+:doc:`C++ Type Aware Allocators <CXXTypeAwareAllocators>` for full details.
+
 C11
 ---
 
diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst
index 6c2b11ac7fc23..e747022b9c173 100644
--- a/clang/docs/LibClang.rst
+++ b/clang/docs/LibClang.rst
@@ -404,3 +404,9 @@ following situations are explicitly unsupported:
   compatible across library versions.
 * For the same reason as above, serializing objects from one version of the
   library and deserializing with a different version is also not supported.
+
+Note: because libclang is a wrapper around the compiler frontend, it is not a
+`security-sensitive component`_ of the LLVM Project. Consider using a sandbox
+or some other mitigation approach if processing untrusted input.
+
+.. _security-sensitive component: https://llvm.org/docs/Security.html#what-is-considered-a-security-issue
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 970825c98fec1..ea16029268dba 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -36,7 +36,7 @@ Potentially Breaking Changes
 - The Objective-C ARC migrator (ARCMigrate) has been removed.
 - Fix missing diagnostics for uses of declarations when performing typename access,
-  such as when performing member access on a '[[deprecated]]' type alias.
+  such as when performing member access on a ``[[deprecated]]`` type alias.
   (#GH58547)
 - For ARM targets when compiling assembly files, the features included in the
   selected CPU or Architecture's FPU are included. If you wish not to use a specific feature,
@@ -133,8 +133,6 @@ C++2c Feature Support
 - Implemented `P0963R3 Structured binding declaration as a condition <https://wg21.link/P0963R3>`_.
 
-- Implemented `P2719R4 Type-aware allocation and deallocation functions <https://wg21.link/P2719R4>`_.
-
 - Implemented `P3618R0 Allow attaching main to the global module <https://wg21.link/P3618R0>`_.
 
 C++23 Feature Support
@@ -676,7 +674,7 @@ Improvements to Clang's diagnostics
   #GH142457, #GH139913, #GH138850, #GH137867, #GH137860, #GH107840, #GH93308,
   #GH69470, #GH59391, #GH58172, #GH46215, #GH45915, #GH45891, #GH44490,
   #GH36703, #GH32903, #GH23312, #GH69874.
-
+
 - Clang no longer emits a spurious -Wdangling-gsl warning in C++23 when
   iterating over an element of a temporary container in a range-based for
   loop. (#GH109793, #GH145164)
@@ -707,6 +705,17 @@ Improvements to Clang's diagnostics
 - Improve the diagnostics for placement new expression when a const-qualified
   object was passed as the storage argument. (#GH143708)
 
+- Clang no longer issues a warning about returning from a function declared with
+  the ``[[noreturn]]`` attribute when the function body ends with a call via
+  pointer, provided it can be proven that the pointer only points to
+  ``[[noreturn]]`` functions.
+
+- Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic
+  diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``).
+  Moved the warning for a missing (though implied) attribute on a redeclaration into this group.
+  Added a new warning in this group for the case where the attribute is missing/implicit on
+  an override of a virtual method.
+
 Improvements to Clang's time-trace
 ----------------------------------
@@ -797,6 +806,9 @@ Bug Fixes in This Version
   declaration statements. Clang now emits a warning for these patterns. (#GH141659)
 - Fixed false positives for redeclaration errors of using enum in nested scopes. (#GH147495)
+- Fixed a failed assertion with an operator call expression which comes from a
+  macro expansion when performing analysis for nullability attributes. (#GH138371)
+- Fixed a concept equivalence checking crash due to untransformed constraint expressions. (#GH146614)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -950,6 +962,7 @@ Bug Fixes to C++ Support
 - Fix a bug where the private access specifier of an overloaded function was not respected. (#GH107629)
 - Correctly handle calling an explicit object member function template overload set
   through its address (``(&Foo::bar)()``).
+- Fix a crash when using an explicit object parameter in a non-member function. (#GH113185)
 - Fix a crash when forming an invalid call to an operator with an explicit object member. (#GH147121)
 - Correctly handle allocations in the condition of an ``if constexpr``. (#GH120197) (#GH134820)
 - Fixed a crash when handling invalid member using-declaration in C++20+ mode. (#GH63254)
@@ -963,6 +976,8 @@ Bug Fixes to C++ Support
 - Fix a crash with NTTP when instantiating a local class.
 - Fixed a crash involving list-initialization of an empty class with a
   non-empty initializer list. (#GH147949)
+- Fixed constant evaluation of equality comparisons of constexpr-unknown references. (#GH147663)
+- Diagnose binding a reference to ``*nullptr`` during constant evaluation. (#GH48665)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -1024,7 +1039,9 @@ Arm and AArch64 Support
   `as specified here `_ is now available.
 - Support has been added for the following processors (command-line identifiers in parentheses):
+
   - Arm Cortex-A320 (``cortex-a320``)
+
 - For ARM targets, cc1as now considers the FPU's features for the selected CPU or Architecture.
 - The ``+nosimd`` attribute is now fully supported for ARM. Previously, this had no effect when being used with
   ARM targets; however, it will now disable the generation of NEON instructions. The ``simd`` option is
@@ -1032,7 +1049,19 @@ Arm and AArch64 Support
 - When a feature that depends on NEON (``simd``) is used, NEON is now automatically enabled.
 - When NEON is disabled (``+nosimd``), all features that depend on NEON will now be disabled.
 
-- Support for __ptrauth type qualifier has been added.
+- Pointer authentication
+
+  - Support for __ptrauth type qualifier has been added.
+  - Objective-C adoption of pointer authentication
+
+    - ``isa`` and ``super`` pointers are protected with address diversity and separate
+      usage specific discriminators.
+    - method list pointers and content are protected with address diversity, and method list
+      pointers have a usage specific discriminator.
+    - ``class_ro_t`` pointers are protected with address diversity and usage specific
+      discriminators.
+    - ``SEL`` typed ivars are protected with address diversity and usage specific
+      discriminators.
 
 - For AArch64, added support for generating executable-only code sections by using the
   ``-mexecute-only`` or ``-mpure-code`` compiler flags. (#GH125688)
@@ -1125,6 +1154,8 @@ Fixed Point Support in Clang
 AST Matchers
 ------------
 
+- Ensure ``hasBitWidth`` doesn't crash on bit widths that are dependent on template
+  parameters.
 - Ensure ``isDerivedFrom`` matches the correct base in case more than one alias exists.
 - Extend ``templateArgumentCountIs`` to support function and variable template
   specialization.
@@ -1176,6 +1207,8 @@ Static Analyzer
 ---------------
 - Fixed a crash when C++20 parenthesized initializer lists are used. This issue
   was causing a crash in clang-tidy. (#GH136041)
+- The Clang Static Analyzer now handles parenthesized initialization.
+  (#GH148875)
 
 New features
 ^^^^^^^^^^^^
@@ -1196,6 +1229,9 @@ New features
   so frequent 'not yet implemented' diagnostics should be expected. Also, the
   ACC MLIR dialect does not currently implement any lowering to LLVM-IR, so no
   code generation is possible for OpenACC.
+- Implemented `P2719R5 Type-aware allocation and deallocation functions <https://wg21.link/P2719R5>`_
+  as an extension in all C++ language modes.
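As a hedged usage sketch (editorial; ``HAS_TYPE_AWARE_NEW`` is an invented macro name), code adopting the extension can gate its declarations on the feature test named in the release notes and in LanguageExtensions.rst:

.. code-block:: c++

  // Detect the P2719 extension before declaring type aware operators.
  #if defined(__has_extension)
  #  if __has_extension(cxx_type_aware_allocators)
  #    define HAS_TYPE_AWARE_NEW 1
  #  endif
  #endif
  #ifndef HAS_TYPE_AWARE_NEW
  #  define HAS_TYPE_AWARE_NEW 0
  #endif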
+ Crash and bug fixes ^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/SanitizerSpecialCaseList.rst b/clang/docs/SanitizerSpecialCaseList.rst index 2c50778d0f491..194f2fc5a7825 100644 --- a/clang/docs/SanitizerSpecialCaseList.rst +++ b/clang/docs/SanitizerSpecialCaseList.rst @@ -39,6 +39,7 @@ Example void bad_foo() { int *a = (int*)malloc(40); a[10] = 1; + free(a); } int main() { bad_foo(); } $ cat ignorelist.txt diff --git a/clang/docs/ThinLTO.rst b/clang/docs/ThinLTO.rst index c042547678919..569405ff5b2b7 100644 --- a/clang/docs/ThinLTO.rst +++ b/clang/docs/ThinLTO.rst @@ -240,6 +240,38 @@ The ``BOOTSTRAP_LLVM_ENABLE_LTO=Thin`` will enable ThinLTO for stage 2 and stage 3 in case the compiler used for stage 1 does not support the ThinLTO option. +Integrated Distributed ThinLTO (DTLTO) +-------------------------------------- + +Integrated Distributed ThinLTO (DTLTO) enables the distribution of backend +ThinLTO compilations via external distribution systems, such as Incredibuild, +during the traditional link step. + +The implementation is documented here: https://llvm.org/docs/DTLTO.html. + +DTLTO requires the LLD linker (``-fuse-ld=lld``). + +``-fthinlto-distributor=`` + - Specifies the ```` to the distributor process executable for DTLTO. + - If specified, ThinLTO backend compilations will be distributed by LLD. + +``-Xthinlto-distributor=`` + - Passes ```` to the distributor process (see ``-fthinlto-distributor=``). + - Can be specified multiple times to pass multiple options. + - Multiple options can also be specified by separating them with commas. + +Examples: + - ``clang -flto=thin -fthinlto-distributor=incredibuild.exe -Xthinlto-distributor=--verbose,--j10 -fuse-ld=lld`` + - ``clang -flto=thin -fthinlto-distributor=$(which python) -Xthinlto-distributor=incredibuild.py -fuse-ld=lld`` + +If ``-fthinlto-distributor=`` is specified, Clang supplies the path to a +compiler to be executed remotely to perform the ThinLTO backend +compilations. Currently, this is Clang itself. + +Note that currently, DTLTO is only supported in some LLD flavors. Support can +be added to other LLD flavours in the future. +See `DTLTO `_ for more information. 
+
 More Information
 ================
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index 6c792af66a62c..4871d05e932ae 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -61,6 +61,7 @@ Using Clang as a Compiler
    APINotes
    DebuggingCoroutines
    AMDGPUSupport
+   CXXTypeAwareAllocators
    CommandGuide/index
    FAQ
 
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index c35311c886413..b929585205aee 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -6953,6 +6953,21 @@ clang_getCursorUnaryOperatorKind(CXCursor cursor);
  * @}
  */
 
+CINDEX_DEPRECATED
+typedef void *CXRemapping;
+
+CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping clang_getRemappings(const char *);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping
+clang_getRemappingsFromFileList(const char **, unsigned);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE unsigned clang_remap_getNumFiles(CXRemapping);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE void
+clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE void clang_remap_dispose(CXRemapping);
+
 LLVM_CLANG_C_EXTERN_C_END
 
 #endif
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 2b9cd035623cc..66ec3395571ea 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2300,6 +2300,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
     return getTypeDeclType(getObjCSelDecl());
   }
 
+  PointerAuthQualifier getObjCMemberSelTypePtrAuth();
+
   /// Retrieve the typedef declaration corresponding to the predefined
   /// Objective-C 'Class' type.
   TypedefDecl *getObjCClassDecl() const;
@@ -2914,10 +2916,14 @@ class ASTContext : public RefCountedBase<ASTContext> {
   NestedNameSpecifier *
   getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const;
 
-  /// Retrieves the default calling convention for the current target.
+  /// Retrieves the default calling convention for the current context.
+  ///
+  /// The context's default calling convention may differ from the current
+  /// target's default calling convention if the -fdefault-calling-conv option
+  /// is used; to get the target's default calling convention, e.g. for built-in
+  /// functions, call getTargetInfo().getDefaultCallingConv() instead.
   CallingConv getDefaultCallingConvention(bool IsVariadic,
-                                          bool IsCXXMethod,
-                                          bool IsBuiltin = false) const;
+                                          bool IsCXXMethod) const;
 
   /// Retrieves the "canonical" template name that refers to a
   /// given template.
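A minimal sketch of the caller-side consequence of the ``getDefaultCallingConvention()`` change above (an editorial illustration; ``defaultCCForBuiltin`` is a hypothetical helper, and the two API calls are the ones named in the new doc comment):

.. code-block:: c++

  #include "clang/AST/ASTContext.h"
  #include "clang/Basic/TargetInfo.h"

  using namespace clang;

  CallingConv defaultCCForBuiltin(const ASTContext &Ctx) {
    // Previously: Ctx.getDefaultCallingConvention(false, false,
    //                                             /*IsBuiltin=*/true).
    // Now the target, not the context, answers for built-in functions:
    return Ctx.getTargetInfo().getDefaultCallingConv();
  }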
diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h
index 514f4cef3a694..0a2db9e205c7c 100644
--- a/clang/include/clang/AST/AbstractBasicReader.h
+++ b/clang/include/clang/AST/AbstractBasicReader.h
@@ -269,12 +269,7 @@ class DataStreamBasicReader : public BasicReaderBase<Impl> {
       case NestedNameSpecifier::Namespace:
         cur = NestedNameSpecifier::Create(ctx, cur,
-                                          asImpl().readNamespaceDeclRef());
-        continue;
-
-      case NestedNameSpecifier::NamespaceAlias:
-        cur = NestedNameSpecifier::Create(ctx, cur,
-                                          asImpl().readNamespaceAliasDeclRef());
+                                          asImpl().readNamespaceBaseDeclRef());
         continue;
 
       case NestedNameSpecifier::TypeSpec:
diff --git a/clang/include/clang/AST/AbstractBasicWriter.h b/clang/include/clang/AST/AbstractBasicWriter.h
index fedde8a2e46c5..c105bbbe45c92 100644
--- a/clang/include/clang/AST/AbstractBasicWriter.h
+++ b/clang/include/clang/AST/AbstractBasicWriter.h
@@ -251,11 +251,7 @@ class DataStreamBasicWriter : public BasicWriterBase<Impl> {
         continue;
 
       case NestedNameSpecifier::Namespace:
-        asImpl().writeNamespaceDeclRef(NNS->getAsNamespace());
-        continue;
-
-      case NestedNameSpecifier::NamespaceAlias:
-        asImpl().writeNamespaceAliasDeclRef(NNS->getAsNamespaceAlias());
+        asImpl().writeNamespaceBaseDeclRef(NNS->getAsNamespace());
         continue;
 
       case NestedNameSpecifier::TypeSpec:
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 3d7969cca83fd..08fe1f881503b 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -565,8 +565,28 @@ class LabelDecl : public NamedDecl {
   static bool classofKind(Kind K) { return K == Label; }
 };
 
+/// Represents C++ namespaces and their aliases.
+///
+/// FIXME: Move `NamespaceBaseDecl` and `NamespaceDecl` to "DeclCXX.h" or
+/// explain why not moving.
+class NamespaceBaseDecl : public NamedDecl {
+protected:
+  using NamedDecl::NamedDecl;
+
+public:
+  NamespaceDecl *getNamespace();
+  const NamespaceDecl *getNamespace() const {
+    return const_cast<NamespaceBaseDecl *>(this)->getNamespace();
+  }
+
+  static bool classof(const Decl *D) { return classofKind(D->getKind()); }
+  static bool classofKind(Kind K) {
+    return K >= firstNamespaceBase && K <= lastNamespaceBase;
+  }
+};
+
 /// Represent a C++ namespace.
-class NamespaceDecl : public NamedDecl,
+class NamespaceDecl : public NamespaceBaseDecl,
                       public DeclContext,
                       public Redeclarable<NamespaceDecl> {
   /// The starting location of the source range, pointing
@@ -3252,6 +3272,11 @@ class FieldDecl : public DeclaratorDecl, public Mergeable<FieldDecl> {
     return hasInClassInitializer() ? InitAndBitWidth->BitWidth : BitWidth;
   }
 
+  /// Determines whether the bit width of this field is a constant integer.
+  /// This may not always be the case, such as inside template-dependent
+  /// expressions.
+  bool hasConstantIntegerBitWidth() const;
+
   /// Computes the bit width of this field, if this is a bit field.
   /// May not be called on non-bitfields.
/// Note that in order to successfully use this function, the bitwidth
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index 77bc3cad72ed9..33ae3d604020b 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -3186,7 +3186,7 @@ class UsingDirectiveDecl : public NamedDecl {
 /// \code
 /// namespace Foo = Bar;
 /// \endcode
-class NamespaceAliasDecl : public NamedDecl,
+class NamespaceAliasDecl : public NamespaceBaseDecl,
                            public Redeclarable<NamespaceAliasDecl> {
   friend class ASTDeclReader;
 
@@ -3203,14 +3203,14 @@ class NamespaceAliasDecl : public NamedDecl,
 
   /// The Decl that this alias points to, either a NamespaceDecl or
   /// a NamespaceAliasDecl.
-  NamedDecl *Namespace;
+  NamespaceBaseDecl *Namespace;
 
   NamespaceAliasDecl(ASTContext &C, DeclContext *DC,
                      SourceLocation NamespaceLoc, SourceLocation AliasLoc,
                      IdentifierInfo *Alias, NestedNameSpecifierLoc QualifierLoc,
-                     SourceLocation IdentLoc, NamedDecl *Namespace)
-      : NamedDecl(NamespaceAlias, DC, AliasLoc, Alias), redeclarable_base(C),
-        NamespaceLoc(NamespaceLoc), IdentLoc(IdentLoc),
+                     SourceLocation IdentLoc, NamespaceBaseDecl *Namespace)
+      : NamespaceBaseDecl(NamespaceAlias, DC, AliasLoc, Alias),
+        redeclarable_base(C), NamespaceLoc(NamespaceLoc), IdentLoc(IdentLoc),
         QualifierLoc(QualifierLoc), Namespace(Namespace) {}
 
   void anchor() override;
@@ -3222,13 +3222,11 @@ class NamespaceAliasDecl : public NamedDecl,
   NamespaceAliasDecl *getMostRecentDeclImpl() override;
 
 public:
-  static NamespaceAliasDecl *Create(ASTContext &C, DeclContext *DC,
-                                    SourceLocation NamespaceLoc,
-                                    SourceLocation AliasLoc,
-                                    IdentifierInfo *Alias,
-                                    NestedNameSpecifierLoc QualifierLoc,
-                                    SourceLocation IdentLoc,
-                                    NamedDecl *Namespace);
+  static NamespaceAliasDecl *
+  Create(ASTContext &C, DeclContext *DC, SourceLocation NamespaceLoc,
+         SourceLocation AliasLoc, IdentifierInfo *Alias,
+         NestedNameSpecifierLoc QualifierLoc, SourceLocation IdentLoc,
+         NamespaceBaseDecl *Namespace);
 
   static NamespaceAliasDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
@@ -3282,7 +3280,7 @@ class NamespaceAliasDecl : public NamedDecl,
 
   /// Retrieve the namespace that this alias refers to, which
   /// may either be a NamespaceDecl or a NamespaceAliasDecl.
-  NamedDecl *getAliasedNamespace() const { return Namespace; }
+  NamespaceBaseDecl *getAliasedNamespace() const { return Namespace; }
 
   SourceRange getSourceRange() const override LLVM_READONLY {
     return SourceRange(NamespaceLoc, IdentLoc);
diff --git a/clang/include/clang/AST/NestedNameSpecifier.h b/clang/include/clang/AST/NestedNameSpecifier.h
index 952c79753d10a..1614f9d7c94e4 100644
--- a/clang/include/clang/AST/NestedNameSpecifier.h
+++ b/clang/include/clang/AST/NestedNameSpecifier.h
@@ -31,8 +31,7 @@ class ASTContext;
 class CXXRecordDecl;
 class IdentifierInfo;
 class LangOptions;
-class NamespaceAliasDecl;
-class NamespaceDecl;
+class NamespaceBaseDecl;
 struct PrintingPolicy;
 class Type;
 class TypeLoc;
@@ -79,12 +78,9 @@ class NestedNameSpecifier : public llvm::FoldingSetNode {
     /// An identifier, stored as an IdentifierInfo*.
     Identifier,
 
-    /// A namespace, stored as a NamespaceDecl*.
+    /// A namespace-like entity, stored as a NamespaceBaseDecl*.
     Namespace,
 
-    /// A namespace alias, stored as a NamespaceAliasDecl*.
-    NamespaceAlias,
-
     /// A type, stored as a Type*.
     TypeSpec,
 
@@ -121,15 +117,10 @@ class NestedNameSpecifier : public llvm::FoldingSetNode {
                                      NestedNameSpecifier *Prefix,
                                      const IdentifierInfo *II);
 
-  /// Builds a nested name specifier that names a namespace.
-  static NestedNameSpecifier *Create(const ASTContext &Context,
-                                     NestedNameSpecifier *Prefix,
-                                     const NamespaceDecl *NS);
-
-  /// Builds a nested name specifier that names a namespace alias.
+  /// Builds a nested name specifier that names a namespace or namespace alias.
   static NestedNameSpecifier *Create(const ASTContext &Context,
                                      NestedNameSpecifier *Prefix,
-                                     const NamespaceAliasDecl *Alias);
+                                     const NamespaceBaseDecl *NS);
 
   /// Builds a nested name specifier that names a type.
   static NestedNameSpecifier *
@@ -174,13 +165,9 @@ class NestedNameSpecifier : public llvm::FoldingSetNode {
     return nullptr;
   }
 
-  /// Retrieve the namespace stored in this nested name
+  /// Retrieve the namespace or namespace alias stored in this nested name
   /// specifier.
-  NamespaceDecl *getAsNamespace() const;
-
-  /// Retrieve the namespace alias stored in this nested name
-  /// specifier.
-  NamespaceAliasDecl *getAsNamespaceAlias() const;
+  NamespaceBaseDecl *getAsNamespace() const;
 
   /// Retrieve the record declaration stored in this nested name
   /// specifier.
@@ -425,29 +412,15 @@ class NestedNameSpecifierLocBuilder {
   /// \param Context The AST context in which this nested-name-specifier
   /// resides.
   ///
-  /// \param Namespace The namespace.
+  /// \param Namespace The namespace or namespace alias.
   ///
-  /// \param NamespaceLoc The location of the namespace name.
+  /// \param NamespaceLoc The location of the namespace name or the namespace
+  /// alias.
   ///
   /// \param ColonColonLoc The location of the trailing '::'.
-  void Extend(ASTContext &Context, NamespaceDecl *Namespace,
+  void Extend(ASTContext &Context, NamespaceBaseDecl *Namespace,
               SourceLocation NamespaceLoc, SourceLocation ColonColonLoc);
 
-  /// Extend the current nested-name-specifier by another
-  /// nested-name-specifier component of the form 'namespace-alias::'.
-  ///
-  /// \param Context The AST context in which this nested-name-specifier
-  /// resides.
-  ///
-  /// \param Alias The namespace alias.
-  ///
-  /// \param AliasLoc The location of the namespace alias
-  /// name.
-  ///
-  /// \param ColonColonLoc The location of the trailing '::'.
-  void Extend(ASTContext &Context, NamespaceAliasDecl *Alias,
-              SourceLocation AliasLoc, SourceLocation ColonColonLoc);
-
   /// Turn this (empty) nested-name-specifier into the global
   /// nested-name-specifier '::'.
  void MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc);
diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td
index 1215056ffde1b..0438e4dfbafac 100644
--- a/clang/include/clang/AST/PropertiesBase.td
+++ b/clang/include/clang/AST/PropertiesBase.td
@@ -91,6 +91,7 @@ def DeclRef : RefPropertyType<"Decl"> { let ConstWhenWriting = 1; }
     SubclassPropertyType<"FunctionDecl", DeclRef>;
 def NamedDeclRef :
     SubclassPropertyType<"NamedDecl", DeclRef>;
+def NamespaceBaseDeclRef :
+    SubclassPropertyType<"NamespaceBaseDecl", DeclRef>;
 def NamespaceDeclRef :
     SubclassPropertyType<"NamespaceDecl", DeclRef>;
 def NamespaceAliasDeclRef :
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 5cb2f57edffe4..519a811775c01 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -789,7 +789,6 @@ bool RecursiveASTVisitor<Derived>::TraverseNestedNameSpecifier(
   switch (NNS->getKind()) {
   case NestedNameSpecifier::Identifier:
   case NestedNameSpecifier::Namespace:
-  case NestedNameSpecifier::NamespaceAlias:
   case NestedNameSpecifier::Global:
   case NestedNameSpecifier::Super:
     return true;
@@ -813,7 +812,6 @@ bool RecursiveASTVisitor<Derived>::TraverseNestedNameSpecifierLoc(
   switch (NNS.getNestedNameSpecifier()->getKind()) {
   case NestedNameSpecifier::Identifier:
   case NestedNameSpecifier::Namespace:
-  case NestedNameSpecifier::NamespaceAlias:
   case NestedNameSpecifier::Global:
   case NestedNameSpecifier::Super:
     return true;
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index ce5d529e813fd..08c898f7758ec 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -710,7 +710,8 @@ AST_MATCHER(FieldDecl, isBitField) {
 ///   fieldDecl(hasBitWidth(2))
 /// matches 'int a;' and 'int c;' but not 'int b;'.
 AST_MATCHER_P(FieldDecl, hasBitWidth, unsigned, Width) {
-  return Node.isBitField() && Node.getBitWidthValue() == Width;
+  return Node.isBitField() && Node.hasConstantIntegerBitWidth() &&
+         Node.getBitWidthValue() == Width;
 }
 
 /// Matches non-static data members that have an in-class initializer.
@@ -7893,9 +7894,9 @@ AST_MATCHER_P_OVERLOAD(NestedNameSpecifierLoc, hasPrefix,
 /// matches "ns::"
 AST_MATCHER_P(NestedNameSpecifier, specifiesNamespace,
               internal::Matcher<NamespaceDecl>, InnerMatcher) {
-  if (!Node.getAsNamespace())
-    return false;
-  return InnerMatcher.matches(*Node.getAsNamespace(), Finder, Builder);
+  if (auto *NS = dyn_cast_if_present<NamespaceDecl>(Node.getAsNamespace()))
+    return InnerMatcher.matches(*NS, Finder, Builder);
+  return false;
 }
 
 /// Matches attributes.
diff --git a/clang/include/clang/Analysis/Analyses/UninitializedValues.h b/clang/include/clang/Analysis/Analyses/UninitializedValues.h
index b151bc3f58321..a9b9caf38e518 100644
--- a/clang/include/clang/Analysis/Analyses/UninitializedValues.h
+++ b/clang/include/clang/Analysis/Analyses/UninitializedValues.h
@@ -50,6 +50,9 @@ class UninitUse {
   /// Is this use a const reference to this variable?
   bool ConstRefUse = false;
 
+  /// Is this use a const pointer to this variable?
+  bool ConstPtrUse = false;
+
   /// This use is always uninitialized if it occurs after any of these branches
   /// is taken.
  SmallVector<Branch, 2> UninitBranches;
@@ -65,11 +68,14 @@ class UninitUse {
   void setUninitAfterCall() { UninitAfterCall = true; }
   void setUninitAfterDecl() { UninitAfterDecl = true; }
   void setConstRefUse() { ConstRefUse = true; }
+  void setConstPtrUse() { ConstPtrUse = true; }
 
   /// Get the expression containing the uninitialized use.
   const Expr *getUser() const { return User; }
 
   bool isConstRefUse() const { return ConstRefUse; }
+  bool isConstPtrUse() const { return ConstPtrUse; }
+  bool isConstRefOrPtrUse() const { return ConstRefUse || ConstPtrUse; }
 
   /// The kind of uninitialized use.
   enum Kind {
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index ab432b7a8ad58..ed51f1d5de447 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -164,6 +164,7 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n")
 BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts")
+TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi", "t", "vmem-to-lds-load-insts")
 
 //===----------------------------------------------------------------------===//
 // Ballot builtins.
@@ -668,12 +669,60 @@ TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tanhh, "hh", "nc", "tanh-insts")
 TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_exp2_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sin_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cos_bf16, "yy", "nc", "bf16-trans-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
 
+// GFX1250 WMMA builtins
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x32_bf16, "V8yIbV16yIbV16yIsV8yIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16, "V8yIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, "V8iIbV8iIbV16iV8iiIbIb", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, "V8fIbV16hIbV32hV8fiIbIb", "nc", 
"gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, "V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index 354531e83991d..e7d6741eb695e 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -1092,6 +1092,12 @@ UNALIASED_CUSTOM_BUILTIN(mma_dmxvi8gerx4spp, "vW1024*W256V", true, "mma,paired-vector-memops") UNALIASED_CUSTOM_BUILTIN(mma_pmdmxvi8gerx4spp, "vW1024*W256Vi255i15i15", true, "mma,paired-vector-memops") +UNALIASED_CUSTOM_BUILTIN(mma_dmsetdmrz, "vW1024*", false, + "mma,isa-future-instructions") +UNALIASED_CUSTOM_BUILTIN(mma_dmmr, "vW1024*W1024*", false, + "mma,isa-future-instructions") +UNALIASED_CUSTOM_BUILTIN(mma_dmxor, "vW1024*W1024*", true, + "mma,isa-future-instructions") // MMA builtins with positive/negative multiply/accumulate. UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV", diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td index b2cd5648e008f..5927eaf80d57a 100644 --- a/clang/include/clang/Basic/BuiltinsRISCV.td +++ b/clang/include/clang/Basic/BuiltinsRISCV.td @@ -157,3 +157,8 @@ def pause : RISCVBuiltin<"void()">; // XCV extensions. //===----------------------------------------------------------------------===// include "clang/Basic/BuiltinsRISCVXCV.td" + +//===----------------------------------------------------------------------===// +// XAndes extensions. +//===----------------------------------------------------------------------===// +include "clang/Basic/BuiltinsRISCVXAndes.td" diff --git a/clang/include/clang/Basic/BuiltinsRISCVXAndes.td b/clang/include/clang/Basic/BuiltinsRISCVXAndes.td new file mode 100644 index 0000000000000..ea9a7166bc6e8 --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsRISCVXAndes.td @@ -0,0 +1,27 @@ +//==- BuiltinsRISCVXAndes.td - RISC-V Andes Builtin database -----*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the Andes-specific builtin function database. Users of +// this file must define the BUILTIN macro to make use of this information. +// +//===----------------------------------------------------------------------===// + +class RISCVXAndesBuiltin : TargetBuiltin { + let Spellings = ["__builtin_riscv_nds_" # NAME]; + let Prototype = prototype; + let Features = features; +} + +let Attributes = [NoThrow, Const] in { +//===----------------------------------------------------------------------===// +// XAndesBFHCvt extension. 
+//===----------------------------------------------------------------------===//
+
+def fcvt_s_bf16 : RISCVXAndesBuiltin<"float(__bf16)", "xandesbfhcvt">;
+def fcvt_bf16_s : RISCVXAndesBuiltin<"__bf16(float)", "xandesbfhcvt">;
+} // Attributes = [NoThrow, Const]
diff --git a/clang/include/clang/Basic/BuiltinsSPIRVVK.td b/clang/include/clang/Basic/BuiltinsSPIRVVK.td
index 61cc0343c415e..5dc3c7588cd2a 100644
--- a/clang/include/clang/Basic/BuiltinsSPIRVVK.td
+++ b/clang/include/clang/Basic/BuiltinsSPIRVVK.td
@@ -11,3 +11,4 @@ include "clang/Basic/BuiltinsSPIRVBase.td"
 
 def reflect : SPIRVBuiltin<"void(...)", [NoThrow, Const]>;
 def faceforward : SPIRVBuiltin<"void(...)", [NoThrow, Const, CustomTypeChecking]>;
+def refract : SPIRVBuiltin<"void(...)", [NoThrow, Const, CustomTypeChecking]>;
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 41031026c99f6..cfffeb71f09d1 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -56,6 +56,8 @@ CODEGENOPT(XCOFFReadOnlyPointers, 1, 0, Benign) ///< Set for -mxcoff-roptr.
 CODEGENOPT(AllTocData, 1, 0, Benign) ///< AIX -mtocdata
 ENUM_CODEGENOPT(FramePointer, FramePointerKind, 2, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,reserved,none
+ENUM_CODEGENOPT(ExceptionHandling, ExceptionHandlingKind, 3, ExceptionHandlingKind::None, NotCompatible)
+
 CODEGENOPT(ClearASTBeforeBackend , 1, 0, Benign) ///< Free the AST before running backend code generation. Only works with -disable-free.
 CODEGENOPT(DisableFree , 1, 0, Benign) ///< Don't free memory.
 CODEGENOPT(DiscardValueNames , 1, 0, Benign) ///< Discard Value Names from the IR (LLVMContext flag)
@@ -209,17 +211,17 @@ CODEGENOPT(ObjCAvoidHeapifyLocalBlocks, 1, 0, Benign)
 
 // The optimization options affect frontend options, which in turn do affect the AST.
-VALUE_CODEGENOPT(OptimizationLevel, 2, 0, Affecting) ///< The -O[0-3] option specified.
-VALUE_CODEGENOPT(OptimizeSize, 2, 0, Affecting) ///< If -Os (==1, Benign) or -Oz (==2, Benign) is specified.
+VALUE_CODEGENOPT(OptimizationLevel, 2, 0, Compatible) ///< The -O[0-3] option specified.
+VALUE_CODEGENOPT(OptimizeSize, 2, 0, Compatible) ///< If -Os (==1) or -Oz (==2) is specified.
 
 CODEGENOPT(AtomicProfileUpdate , 1, 0, Benign) ///< Set -fprofile-update=atomic
 CODEGENOPT(ContinuousProfileSync, 1, 0, Benign) ///< Enable continuous instrumentation profiling
 
 /// Choose profile instrumentation kind or no instrumentation.
-ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 4, llvm::driver::ProfileInstrKind::ProfileNone, Benign)
+ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 4, ProfileInstrKind::ProfileNone, Benign)
 /// Choose profile kind for PGO use compilation.
-ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone, Benign)
+ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileInstrKind::ProfileNone, Benign)
 /// Partition functions into N groups and select only functions in group i to be
 /// instrumented. Selected group numbers can be 0 to N-1 inclusive.
 VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1, Benign)
@@ -244,8 +246,8 @@ CODEGENOPT(SaveTempLabels , 1, 0, Benign) ///< Save temporary labels.
CODEGENOPT(SanitizeAddressUseAfterScope , 1, 0, Benign) ///< Enable use-after-scope detection ///< in AddressSanitizer ENUM_CODEGENOPT(SanitizeAddressUseAfterReturn, - llvm::AsanDetectStackUseAfterReturnMode, 2, - llvm::AsanDetectStackUseAfterReturnMode::Runtime, + AsanDetectStackUseAfterReturnMode, 2, + AsanDetectStackUseAfterReturnMode::Runtime, Benign ) ///< Set detection mode for stack-use-after-return. CODEGENOPT(SanitizeAddressPoisonCustomArrayCookie, 1, 0, Benign) ///< Enable poisoning operator new[] which is not a replaceable @@ -255,9 +257,9 @@ CODEGENOPT(SanitizeAddressGlobalsDeadStripping, 1, 0, Benign) ///< Enable linker CODEGENOPT(SanitizeAddressUseOdrIndicator, 1, 0, Benign) ///< Enable ODR indicator globals CODEGENOPT(SanitizeMemoryTrackOrigins, 2, 0, Benign) ///< Enable tracking origins in ///< MemorySanitizer -ENUM_CODEGENOPT(SanitizeAddressDtor, llvm::AsanDtorKind, 2, - llvm::AsanDtorKind::Global, Benign) ///< Set how ASan global - ///< destructors are emitted. +ENUM_CODEGENOPT(SanitizeAddressDtor, AsanDtorKind, 2, + AsanDtorKind::Global, Benign) ///< Set how ASan global + ///< destructors are emitted. CODEGENOPT(SanitizeMemoryParamRetval, 1, 0, Benign) ///< Enable detection of uninitialized ///< parameters and return values ///< in MemorySanitizer @@ -375,13 +377,13 @@ VALUE_CODEGENOPT(SmallDataLimit, 32, 0, Benign) VALUE_CODEGENOPT(SSPBufferSize, 32, 0, Benign) /// The kind of inlining to perform. -ENUM_CODEGENOPT(Inlining, InliningMethod, 2, NormalInlining, Benign) +ENUM_CODEGENOPT(Inlining, InliningMethod, 2, NormalInlining, Compatible) /// The maximum stack size a function can have to be considered for inlining. VALUE_CODEGENOPT(InlineMaxStackSize, 32, UINT_MAX, Benign) // Vector functions library to use. -ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 4, llvm::driver::VectorLibrary::NoLibrary, Benign) +ENUM_CODEGENOPT(VecLib, VectorLibrary, 4, VectorLibrary::NoLibrary, Benign) /// The default TLS model to use. ENUM_CODEGENOPT(DefaultTLSModel, TLSModel, 2, GeneralDynamicTLSModel, Benign) @@ -457,8 +459,8 @@ ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2, CODEGENOPT(SkipRaxSetup, 1, 0, Benign) /// Whether to zero out caller-used registers before returning. -ENUM_CODEGENOPT(ZeroCallUsedRegs, llvm::ZeroCallUsedRegs::ZeroCallUsedRegsKind, - 5, llvm::ZeroCallUsedRegs::ZeroCallUsedRegsKind::Skip, Benign) +ENUM_CODEGENOPT(ZeroCallUsedRegs, ZeroCallUsedRegsKind, + 5, ZeroCallUsedRegsKind::Skip, Benign) /// Modify C++ ABI to returning `this` pointer from constructors and /// non-deleting destructors. (No effect on Microsoft ABI.) @@ -477,8 +479,8 @@ CODEGENOPT(ResMayAlias, 1, 0, Benign) /// Controls how unwind v2 (epilog) information should be generated for x64 /// Windows. -ENUM_CODEGENOPT(WinX64EHUnwindV2, llvm::WinX64EHUnwindV2Mode, - 2, llvm::WinX64EHUnwindV2Mode::Disabled, Benign) +ENUM_CODEGENOPT(WinX64EHUnwindV2, WinX64EHUnwindV2Mode, + 2, WinX64EHUnwindV2Mode::Disabled, Benign) /// FIXME: Make DebugOptions its own top-level .def file. #include "DebugOptions.def" diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index f6c61f202890e..cdeedd5b4eac6 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -46,13 +46,32 @@ class CodeGenOptionsBase { enum class CompatibilityKind { /// Does affect the construction of the AST in a way that does prevent /// module interoperability. 
-    Affecting,
+    NotCompatible,
+    /// Does affect the construction of the AST in a way that doesn't prevent
+    /// interoperability (that is, the value can be different between an
+    /// explicit module and the user of that module).
+    Compatible,
     /// Does not affect the construction of the AST in any way (that is, the
     /// value can be different between an implicit module and the user of that
     /// module).
     Benign,
   };
 
+  using CFBranchLabelSchemeKind = clang::CFBranchLabelSchemeKind;
+  using ProfileInstrKind = llvm::driver::ProfileInstrKind;
+  using AsanDetectStackUseAfterReturnMode =
+      llvm::AsanDetectStackUseAfterReturnMode;
+  using AsanDtorKind = llvm::AsanDtorKind;
+  using VectorLibrary = llvm::driver::VectorLibrary;
+  using ZeroCallUsedRegsKind = llvm::ZeroCallUsedRegs::ZeroCallUsedRegsKind;
+  using WinX64EHUnwindV2Mode = llvm::WinX64EHUnwindV2Mode;
+
+  using DebugCompressionType = llvm::DebugCompressionType;
+  using EmitDwarfUnwindType = llvm::EmitDwarfUnwindType;
+  using DebugTemplateNamesKind = llvm::codegenoptions::DebugTemplateNamesKind;
+  using DebugInfoKind = llvm::codegenoptions::DebugInfoKind;
+  using DebuggerKind = llvm::DebuggerKind;
+
 #define CODEGENOPT(Name, Bits, Default, Compatibility) unsigned Name : Bits;
 #define ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility)
 #include "clang/Basic/CodeGenOptions.def"
@@ -157,6 +176,9 @@ class CodeGenOptions : public CodeGenOptionsBase {
     llvm_unreachable("invalid FramePointerKind");
   }
 
+  /// Possible exception handling behavior.
+  enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm };
+
   enum class SwiftAsyncFramePointerKind {
     Auto,   // Choose Swift async extended frame info based on deployment target.
     Always, // Unconditionally emit Swift async extended frame info.
@@ -533,6 +555,22 @@ class CodeGenOptions : public CodeGenOptionsBase {
     return NoBuiltinFuncs;
   }
 
+  bool hasSjLjExceptions() const {
+    return getExceptionHandling() == ExceptionHandlingKind::SjLj;
+  }
+
+  bool hasSEHExceptions() const {
+    return getExceptionHandling() == ExceptionHandlingKind::WinEH;
+  }
+
+  bool hasDWARFExceptions() const {
+    return getExceptionHandling() == ExceptionHandlingKind::DwarfCFI;
+  }
+
+  bool hasWasmExceptions() const {
+    return getExceptionHandling() == ExceptionHandlingKind::Wasm;
+  }
+
   /// Check if Clang profile instrumentation is on.
   bool hasProfileClangInstr() const {
     return getProfileInstr() ==
diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def
index e2ab06184b96a..c6e736e92744c 100644
--- a/clang/include/clang/Basic/DebugOptions.def
+++ b/clang/include/clang/Basic/DebugOptions.def
@@ -28,9 +28,9 @@ VALUE_CODEGENOPT(Name, Bits, Default, Compatibility)
 ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility)
 #endif
 
-ENUM_DEBUGOPT(CompressDebugSections, llvm::DebugCompressionType, 2,
-              llvm::DebugCompressionType::None, Benign)
-DEBUGOPT(Dwarf64, 1, 0, Affecting) ///< -gdwarf64.
+ENUM_DEBUGOPT(CompressDebugSections, DebugCompressionType, 2,
+              DebugCompressionType::None, Benign)
+DEBUGOPT(Dwarf64, 1, 0, Compatible) ///< -gdwarf64.
 DEBUGOPT(EnableDIPreservationVerify, 1, 0, Benign) ///< Enable di preservation
                                                    ///< verify each (it means
                                                    ///< check the original debug
@@ -40,8 +40,8 @@ DEBUGOPT(ForceDwarfFrameSection , 1, 0, Benign) ///< Set when -fforce-dwarf-fram
                                             ///< is enabled.
 
 ///< Set when -femit-dwarf-unwind is passed.
-ENUM_DEBUGOPT(EmitDwarfUnwind, llvm::EmitDwarfUnwindType, 2, - llvm::EmitDwarfUnwindType::Default, Benign) +ENUM_DEBUGOPT(EmitDwarfUnwind, EmitDwarfUnwindType, 2, + EmitDwarfUnwindType::Default, Benign) DEBUGOPT(NoDwarfDirectoryAsm , 1, 0, Benign) ///< Set when -fno-dwarf-directory-asm ///< is enabled. @@ -49,8 +49,8 @@ DEBUGOPT(NoDwarfDirectoryAsm , 1, 0, Benign) ///< Set when -fno-dwarf-directory- DEBUGOPT(NoInlineLineTables, 1, 0, Benign) ///< Whether debug info should contain ///< inline line tables. -DEBUGOPT(DebugStrictDwarf, 1, 1, Affecting) ///< Whether or not to use strict DWARF info. -DEBUGOPT(DebugOmitUnreferencedMethods, 1, 0, Affecting) ///< Omit unreferenced member +DEBUGOPT(DebugStrictDwarf, 1, 1, Compatible) ///< Whether or not to use strict DWARF info. +DEBUGOPT(DebugOmitUnreferencedMethods, 1, 0, Compatible) ///< Omit unreferenced member ///< functions in type debug info. /// Control the Assignment Tracking debug info feature. @@ -60,73 +60,73 @@ ENUM_DEBUGOPT(AssignmentTrackingMode, AssignmentTrackingOpts, 2, /// Whether or not to use Key Instructions to determine breakpoint locations. DEBUGOPT(DebugKeyInstructions, 1, 0, Benign) -DEBUGOPT(DebugColumnInfo, 1, 0, Affecting) ///< Whether or not to use column information +DEBUGOPT(DebugColumnInfo, 1, 0, Compatible) ///< Whether or not to use column information ///< in debug info. -DEBUGOPT(DebugTypeExtRefs, 1, 0, Affecting) ///< Whether or not debug info should contain +DEBUGOPT(DebugTypeExtRefs, 1, 0, Compatible) ///< Whether or not debug info should contain ///< external references to a PCH or module. -DEBUGOPT(DebugExplicitImport, 1, 0, Affecting) ///< Whether or not debug info should +DEBUGOPT(DebugExplicitImport, 1, 0, Compatible) ///< Whether or not debug info should ///< contain explicit imports for ///< anonymous namespaces /// Set debug info source file hashing algorithm. -ENUM_DEBUGOPT(DebugSrcHash, DebugSrcHashKind, 2, DSH_MD5, Affecting) +ENUM_DEBUGOPT(DebugSrcHash, DebugSrcHashKind, 2, DSH_MD5, Compatible) -DEBUGOPT(SplitDwarfInlining, 1, 1, Affecting) ///< Whether to include inlining info in the +DEBUGOPT(SplitDwarfInlining, 1, 1, Compatible) ///< Whether to include inlining info in the ///< skeleton CU to allow for symbolication ///< of inline stack frames without .dwo files. -DEBUGOPT(DebugFwdTemplateParams, 1, 0, Affecting) ///< Whether to emit complete +DEBUGOPT(DebugFwdTemplateParams, 1, 0, Compatible) ///< Whether to emit complete ///< template parameter descriptions in ///< forward declarations (versus just ///< including them in the name). ENUM_DEBUGOPT(DebugSimpleTemplateNames, - llvm::codegenoptions::DebugTemplateNamesKind, 2, - llvm::codegenoptions::DebugTemplateNamesKind::Full, Affecting) + DebugTemplateNamesKind, 2, + DebugTemplateNamesKind::Full, Compatible) ///< Whether to emit template parameters in the textual names of ///< template specializations. ///< Implies DebugFwdTemplateNames to allow decorated names to be ///< reconstructed when needed. /// The kind of generated debug info. -ENUM_DEBUGOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, - llvm::codegenoptions::NoDebugInfo, Affecting) +ENUM_DEBUGOPT(DebugInfo, DebugInfoKind, 4, + DebugInfoKind::NoDebugInfo, Compatible) /// Whether to generate macro debug info. -DEBUGOPT(MacroDebugInfo, 1, 0, Affecting) +DEBUGOPT(MacroDebugInfo, 1, 0, Compatible) /// Tune the debug info for this debugger. 
-ENUM_DEBUGOPT(DebuggerTuning, llvm::DebuggerKind, 3, - llvm::DebuggerKind::Default, Affecting) +ENUM_DEBUGOPT(DebuggerTuning, DebuggerKind, 3, + DebuggerKind::Default, Compatible) /// Dwarf version. Version zero indicates to LLVM that no DWARF should be /// emitted. -VALUE_DEBUGOPT(DwarfVersion, 3, 0, Affecting) +VALUE_DEBUGOPT(DwarfVersion, 3, 0, Compatible) /// Whether we should emit CodeView debug information. It's possible to emit /// CodeView and DWARF into the same object. -DEBUGOPT(EmitCodeView, 1, 0, Affecting) +DEBUGOPT(EmitCodeView, 1, 0, Compatible) /// Whether to emit the .debug$H section containing hashes of CodeView types. -DEBUGOPT(CodeViewGHash, 1, 0, Affecting) +DEBUGOPT(CodeViewGHash, 1, 0, Compatible) /// Whether to emit the compiler path and command line into the CodeView debug information. -DEBUGOPT(CodeViewCommandLine, 1, 0, Affecting) +DEBUGOPT(CodeViewCommandLine, 1, 0, Compatible) /// Whether emit extra debug info for sample pgo profile collection. -DEBUGOPT(DebugInfoForProfiling, 1, 0, Affecting) +DEBUGOPT(DebugInfoForProfiling, 1, 0, Compatible) /// Whether to emit DW_TAG_template_alias for template aliases. -DEBUGOPT(DebugTemplateAlias, 1, 0, Affecting) +DEBUGOPT(DebugTemplateAlias, 1, 0, Compatible) /// Whether to emit .debug_gnu_pubnames section instead of .debug_pubnames. -DEBUGOPT(DebugNameTable, 2, 0, Affecting) +DEBUGOPT(DebugNameTable, 2, 0, Compatible) /// Whether to use DWARF base address specifiers in .debug_ranges. -DEBUGOPT(DebugRangesBaseAddress, 1, 0, Affecting) +DEBUGOPT(DebugRangesBaseAddress, 1, 0, Compatible) /// Whether to embed source in DWARF debug line section. -DEBUGOPT(EmbedSource, 1, 0, Affecting) +DEBUGOPT(EmbedSource, 1, 0, Compatible) #undef DEBUGOPT #undef ENUM_DEBUGOPT diff --git a/clang/include/clang/Basic/DeclNodes.td b/clang/include/clang/Basic/DeclNodes.td index f1ebaf1db3fc0..8d6731b50f509 100644 --- a/clang/include/clang/Basic/DeclNodes.td +++ b/clang/include/clang/Basic/DeclNodes.td @@ -15,9 +15,10 @@ def PragmaComment : DeclNode; def PragmaDetectMismatch : DeclNode; def ExternCContext : DeclNode, DeclContext; def Named : DeclNode; - def Namespace : DeclNode, DeclContext; + def NamespaceBase : DeclNode; + def Namespace : DeclNode, DeclContext; + def NamespaceAlias : DeclNode; def UsingDirective : DeclNode; - def NamespaceAlias : DeclNode; def Label : DeclNode; def Type : DeclNode; def TypedefName : DeclNode; diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index a67b9995d3b54..071a38f513911 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -174,10 +174,11 @@ def note_constexpr_heap_alloc_limit_exceeded : Note< def note_constexpr_this : Note< "%select{|implicit }0use of 'this' pointer is only allowed within the " "evaluation of a call to a 'constexpr' member function">; -def access_kind : TextSubstitution< - "%select{read of|read of|assignment to|increment of|decrement of|" - "member call on|dynamic_cast of|typeid applied to|construction of|" - "destruction of|read of}0">; +def access_kind + : TextSubstitution< + "%select{read of|read of|assignment to|increment of|decrement of|" + "member call on|dynamic_cast of|typeid applied to|construction of|" + "destruction of|read of|read of}0">; def access_kind_subobject : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" "member call on|dynamic_cast of|typeid applied to|" @@ -222,6 +223,9 @@ def 
note_constexpr_ltor_incomplete_type : Note< def note_constexpr_access_null : Note< "%sub{access_kind}0 " "dereferenced null pointer is not allowed in a constant expression">; +def note_constexpr_dereferencing_null + : Note<"dereferencing a null pointer is not allowed in a constant " + "expression">; def note_constexpr_access_past_end : Note< "%sub{access_kind}0 dereferenced one-past-the-end pointer " "is not allowed in a constant expression">; diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 9a7a308600763..ccb18aa37447e 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -952,9 +952,11 @@ def UninitializedMaybe : DiagGroup<"conditional-uninitialized">; def UninitializedSometimes : DiagGroup<"sometimes-uninitialized">; def UninitializedStaticSelfInit : DiagGroup<"static-self-init">; def UninitializedConstReference : DiagGroup<"uninitialized-const-reference">; +def UninitializedConstPointer : DiagGroup<"uninitialized-const-pointer">; def Uninitialized : DiagGroup<"uninitialized", [UninitializedSometimes, UninitializedStaticSelfInit, - UninitializedConstReference]>; + UninitializedConstReference, + UninitializedConstPointer]>; def IgnoredPragmaIntrinsic : DiagGroup<"ignored-pragma-intrinsic">; // #pragma optimize is often used to avoid to work around MSVC codegen bugs or // to disable inlining. It's not completely clear what alternative to suggest @@ -1291,6 +1293,7 @@ def ThreadSafetyBeta : DiagGroup<"thread-safety-beta">; // Warnings and notes related to the function effects system which underlies // the nonblocking and nonallocating attributes. def FunctionEffects : DiagGroup<"function-effects">; +def FunctionEffectRedeclarations : DiagGroup<"function-effect-redeclarations">; def PerfConstraintImpliesNoexcept : DiagGroup<"perf-constraint-implies-noexcept">; // Uniqueness Analysis warnings diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 577adc30ba2fa..b2ea65ae111be 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2548,6 +2548,10 @@ def warn_uninit_const_reference : Warning< "variable %0 is uninitialized when passed as a const reference argument " "here">, InGroup, DefaultIgnore; +def warn_uninit_const_pointer : Warning< + "variable %0 is uninitialized when passed as a const pointer argument here">, + InGroup, DefaultIgnore; + def warn_unsequenced_mod_mod : Warning< "multiple unsequenced modifications to %0">, InGroup; def warn_unsequenced_mod_use : Warning< @@ -10081,11 +10085,8 @@ def err_destroying_operator_delete_not_usual : Error< def err_type_aware_destroying_operator_delete : Error< "destroying delete is not permitted to be type aware">; -def ext_cxx26_type_aware_allocators : ExtWarn< - "type aware allocators are a C++2c extension">, InGroup; -def warn_cxx26_type_aware_allocators : Warning< - "type aware allocators are incompatible with C++ standards before C++2c">, - DefaultIgnore, InGroup; +def warn_ext_type_aware_allocators : ExtWarn< + "type aware allocators are a Clang extension">, InGroup>; def err_type_aware_allocator_missing_matching_operator : Error< "declaration of type aware %0 in %1 must have matching type aware %2" >; @@ -11529,17 +11530,28 @@ def note_in_evaluating_default_argument : Note< def warn_invalid_add_func_effects : Warning< "attribute '%0' should not be added via type conversion">, InGroup, 
DefaultIgnore; -def warn_mismatched_func_effect_override : Warning< - "attribute '%0' on overriding function does not match base declaration">, - InGroup, DefaultIgnore; -def warn_mismatched_func_effect_redeclaration : Warning< - "attribute '%0' on function does not match previous declaration">, - InGroup, DefaultIgnore; +def warn_conflicting_func_effect_override + : Warning<"attribute '%0' on overriding function conflicts with base " + "declaration">, + InGroup, + DefaultIgnore; def warn_conflicting_func_effects : Warning< "effects conflict when merging declarations; kept '%0', discarded '%1'">, InGroup, DefaultIgnore; def err_func_with_effects_no_prototype : Error< "'%0' function must have a prototype">; +// These are more pedantic: in redeclarations and virtual method overrides, +// the effect attribute(s) should be restated. +def warn_mismatched_func_effect_override + : Warning<"overriding function is missing '%0' attribute from base " + "declaration">, + InGroup, + DefaultIgnore; +def warn_mismatched_func_effect_redeclaration + : Warning< + "redeclaration is missing '%0' attribute from previous declaration">, + InGroup, + DefaultIgnore; } // end of sema category @@ -12356,7 +12368,7 @@ def err_export_using_internal : Error< "using declaration referring to %1 with %select{internal|module|unknown}0 " "linkage cannot be exported">; def err_export_not_in_module_interface : Error< - "export declaration can only be used within a module purview">; + "export declaration can only be used within a module interface">; def err_export_inline_not_defined : Error< "inline function not defined%select{| before the private module fragment}0">; def err_export_partition_impl : Error< @@ -13477,6 +13489,12 @@ def err_acc_invalid_default_type def err_acc_device_type_multiple_archs : Error<"OpenACC 'device_type' clause on a 'set' construct only permits " "one architecture">; +def warn_acc_var_referenced_lacks_op + : Warning<"variable of type %0 referenced in OpenACC '%1' clause does not " + "have a %enum_select{%DefCtor{default " + "constructor}|%Dtor{destructor}}2; reference has no effect">, + InGroup>, + DefaultError; // AMDGCN builtins diagnostics def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; diff --git a/clang/include/clang/Basic/DiagnosticSerializationKinds.td b/clang/include/clang/Basic/DiagnosticSerializationKinds.td index 584c8d62280bf..fc3585ffe8549 100644 --- a/clang/include/clang/Basic/DiagnosticSerializationKinds.td +++ b/clang/include/clang/Basic/DiagnosticSerializationKinds.td @@ -41,6 +41,10 @@ def err_ast_file_langopt_mismatch : Error<"%0 was %select{disabled|enabled}1 in "precompiled file '%3' but is currently %select{disabled|enabled}2">; def err_ast_file_langopt_value_mismatch : Error< "%0 differs in precompiled file '%1' vs. current file">; +def err_ast_file_codegenopt_mismatch : Error<"%0 was %select{disabled|enabled}1 in " + "precompiled file '%3' but is currently %select{disabled|enabled}2">; +def err_ast_file_codegenopt_value_mismatch + : Error<"%0 differs in precompiled file '%1' vs. 
current file">; def err_ast_file_diagopt_mismatch : Error<"%0 is currently enabled, but was not in " "the precompiled file '%1'">; def err_ast_file_modulecache_mismatch : Error<"precompiled file '%2' was compiled with module cache " diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def index 579461b4f710f..72f23614aef11 100644 --- a/clang/include/clang/Basic/Features.def +++ b/clang/include/clang/Basic/Features.def @@ -160,6 +160,12 @@ FEATURE(ptrauth_indirect_gotos, LangOpts.PointerAuthIndirectGotos) FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini) FEATURE(ptrauth_init_fini_address_discrimination, LangOpts.PointerAuthInitFiniAddressDiscrimination) FEATURE(ptrauth_elf_got, LangOpts.PointerAuthELFGOT) + +FEATURE(ptrauth_objc_isa, LangOpts.PointerAuthObjcIsa) +FEATURE(ptrauth_objc_interface_sel, LangOpts.PointerAuthObjcInterfaceSel) +FEATURE(ptrauth_objc_signable_class, true) +FEATURE(ptrauth_objc_method_list_pointer, LangOpts.PointerAuthCalls) + EXTENSION(swiftcc, PP.getTargetInfo().checkCallingConvention(CC_Swift) == clang::TargetInfo::CCCR_OK) @@ -366,5 +372,8 @@ FEATURE(clang_atomic_attributes, true) FEATURE(cuda_noinline_keyword, LangOpts.CUDA) EXTENSION(cuda_implicit_host_device_templates, LangOpts.CUDA && LangOpts.OffloadImplicitHostDeviceTemplates) +// C++2d type-aware allocators +EXTENSION(cxx_type_aware_allocators, true) + #undef EXTENSION #undef FEATURE diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index e43238ba683f2..6ac8d496f1494 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -98,8 +98,6 @@ LANGOPT(Exceptions , 1, 0, NotCompatible, "exception handling") LANGOPT(ObjCExceptions , 1, 0, NotCompatible, "Objective-C exceptions") LANGOPT(CXXExceptions , 1, 0, NotCompatible, "C++ exceptions") LANGOPT(EHAsynch , 1, 0, NotCompatible, "C/C++ EH Asynch exceptions") -ENUM_LANGOPT(ExceptionHandling, ExceptionHandlingKind, 3, - ExceptionHandlingKind::None, NotCompatible, "exception handling") LANGOPT(IgnoreExceptions , 1, 0, NotCompatible, "ignore exceptions") LANGOPT(ExternCNoUnwind , 1, 0, NotCompatible, "Assume extern C functions don't unwind") LANGOPT(AssumeNothrowExceptionDtor , 1, 0, NotCompatible, "Assume exception object's destructor is nothrow") @@ -133,6 +131,11 @@ LANGOPT(PointerAuthInitFiniAddressDiscrimination, 1, 0, NotCompatible, LANGOPT(PointerAuthELFGOT, 1, 0, NotCompatible, "authenticate pointers from GOT") LANGOPT(AArch64JumpTableHardening, 1, 0, NotCompatible, "use hardened lowering for jump-table dispatch") +LANGOPT(PointerAuthObjcIsa, 1, 0, NotCompatible, "authentication of isa and super pointers in ObjC instances") +LANGOPT(PointerAuthObjcInterfaceSel, 1, 0, NotCompatible, "authentication of SEL fields of ObjC interfaces") +LANGOPT(PointerAuthObjcInterfaceSelKey, 16, 0, NotCompatible, "authentication key for SEL fields of ObjC interfaces") +LANGOPT(PointerAuthObjcClassROPointers, 1, 0, Benign, "class_ro_t pointer authentication") + LANGOPT(DoubleSquareBracketAttributes, 1, 0, NotCompatible, "'[[]]' attributes extension for all language standard modes") LANGOPT(ExperimentalLateParseAttributes, 1, 0, NotCompatible, "experimental late parsing of attributes") @@ -161,8 +164,6 @@ LANGOPT(ModulesValidateTextualHeaderIncludes, 1, 1, Compatible, "validation of t LANGOPT(ModulesErrorRecovery, 1, 1, Benign, "automatically importing modules as needed when performing error recovery") LANGOPT(ImplicitModules, 1, 1, Benign, 
"building modules that are not specified via -fmodule-file") LANGOPT(ModulesLocalVisibility, 1, 0, Compatible, "local submodule visibility") -LANGOPT(Optimize , 1, 0, Compatible, "__OPTIMIZE__ predefined macro") -LANGOPT(OptimizeSize , 1, 0, Compatible, "__OPTIMIZE_SIZE__ predefined macro") LANGOPT(Static , 1, 0, Compatible, "__STATIC__ predefined macro (as opposed to __DYNAMIC__)") VALUE_LANGOPT(PackStruct , 32, 0, NotCompatible, "default struct packing maximum alignment") @@ -179,7 +180,6 @@ VALUE_LANGOPT(PIE , 1, 0, Compatible, "is pie") LANGOPT(ROPI , 1, 0, NotCompatible, "Read-only position independence") LANGOPT(RWPI , 1, 0, NotCompatible, "Read-write position independence") LANGOPT(GNUInline , 1, 0, Compatible, "GNU inline semantics") -LANGOPT(NoInlineDefine , 1, 0, Compatible, "__NO_INLINE__ predefined macro") LANGOPT(Deprecated , 1, 0, Compatible, "__DEPRECATED predefined macro") LANGOPT(FastMath , 1, 0, Compatible, "fast FP math optimizations, and __FAST_MATH__ predefined macro") LANGOPT(UnsafeFPMath , 1, 0, Compatible, "Unsafe Floating Point Math") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 4c642c9e10c91..0407897359b5e 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -337,9 +337,6 @@ class LangOptionsBase { enum ExcessPrecisionKind { FPP_Standard, FPP_Fast, FPP_None }; - /// Possible exception handling behavior. - enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm }; - enum class LaxVectorConversionKind { /// Permit no implicit vector bitcasts. None, @@ -636,11 +633,6 @@ class LangOptions : public LangOptionsBase { // received as a result of a standard operator new (-fcheck-new) bool CheckNew = false; - // In OpenACC mode, contains a user provided override for the _OPENACC macro. - // This exists so that we can override the macro value and test our incomplete - // implementation on real-world examples. - std::string OpenACCMacroOverride; - /// The HLSL root signature version for dxil. llvm::dxbc::RootSignatureVersion HLSLRootSigVer = llvm::dxbc::RootSignatureVersion::V1_1; @@ -788,22 +780,6 @@ class LangOptions : public LangOptionsBase { return getSignReturnAddressScope() == SignReturnAddressScopeKind::All; } - bool hasSjLjExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::SjLj; - } - - bool hasSEHExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::WinEH; - } - - bool hasDWARFExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::DwarfCFI; - } - - bool hasWasmExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::Wasm; - } - bool isSYCL() const { return SYCLIsDevice || SYCLIsHost; } bool hasDefaultVisibilityExportMapping() const { diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h index a3a3e50bcde5d..fb6dddf3ae9ce 100644 --- a/clang/include/clang/Basic/PointerAuthOptions.h +++ b/clang/include/clang/Basic/PointerAuthOptions.h @@ -27,6 +27,26 @@ namespace clang { /// .fini_array. The value is ptrauth_string_discriminator("init_fini") constexpr uint16_t InitFiniPointerConstantDiscriminator = 0xD9D4; +/// Constant discriminator to be used with method list pointers. The value is +/// ptrauth_string_discriminator("method_list_t") +constexpr uint16_t MethodListPointerConstantDiscriminator = 0xC310; + +/// Constant discriminator to be used with objective-c isa pointers. 
The value +/// is ptrauth_string_discriminator("isa") +constexpr uint16_t IsaPointerConstantDiscriminator = 0x6AE1; + +/// Constant discriminator to be used with objective-c superclass pointers. +/// The value is ptrauth_string_discriminator("objc_class:superclass") +constexpr uint16_t SuperPointerConstantDiscriminator = 0xB5AB; + +/// Constant discriminator to be used with objective-c sel pointers. The value +/// is ptrauth_string_discriminator("sel") +constexpr uint16_t SelPointerConstantDiscriminator = 0x57c2; + +/// Constant discriminator to be used with objective-c class_ro_t pointers. +/// The value is ptrauth_string_discriminator("class_data_bits") +constexpr uint16_t ClassROConstantDiscriminator = 0x61F8; + constexpr unsigned PointerAuthKeyNone = -1; /// Constant discriminator for std::type_info vtable pointers: 0xB1EA/45546 @@ -202,6 +222,21 @@ struct PointerAuthOptions { /// The ABI for function addresses in .init_array and .fini_array PointerAuthSchema InitFiniPointers; + + /// The ABI for Objective-C method lists. + PointerAuthSchema ObjCMethodListFunctionPointers; + + /// The ABI for a reference to an Objective-C method list in _class_ro_t. + PointerAuthSchema ObjCMethodListPointer; + + /// The ABI for Objective-C isa pointers. + PointerAuthSchema ObjCIsaPointers; + + /// The ABI for Objective-C superclass pointers. + PointerAuthSchema ObjCSuperPointers; + + /// The ABI for Objective-C class_ro_t pointers. + PointerAuthSchema ObjCClassROPointers; }; } // end namespace clang diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index f05dcae0a318c..8b66c73474704 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -337,10 +337,6 @@ class TargetInfo : public TransferrableTargetInfo, /// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055a/IHI0055A_aapcs64.pdf AArch64ABIBuiltinVaList, - /// __builtin_va_list as defined by the PNaCl ABI: - /// http://www.chromium.org/nativeclient/pnacl/bitcode-abi#TOC-Machine-Types - PNaClABIBuiltinVaList, - /// __builtin_va_list as defined by the Power ABI: /// https://www.power.org /// /resources/downloads/Power-Arch-32-bit-ABI-supp-1.0-Embedded.pdf @@ -1702,8 +1698,11 @@ class TargetInfo : public TransferrableTargetInfo, /// Controls if __arithmetic_fence is supported in the targeted backend. virtual bool checkArithmeticFenceSupported() const { return false; } - /// Gets the default calling convention for the given target and - /// declaration context. + /// Gets the default calling convention for the given target. + /// + /// This function does not take into account any user options to override the + /// default calling convention. For that, see + /// ASTContext::getDefaultCallingConvention(). virtual CallingConv getDefaultCallingConv() const { // Not all targets will specify an explicit calling convention that we can // express. 
This will always do the right thing, even though it's not
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 277c278fd38b7..25baf278bba38 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -129,6 +129,22 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
   cir::BoolAttr getTrueAttr() { return getCIRBoolAttr(true); }
   cir::BoolAttr getFalseAttr() { return getCIRBoolAttr(false); }
 
+  mlir::Value createComplexCreate(mlir::Location loc, mlir::Value real,
+                                  mlir::Value imag) {
+    auto resultComplexTy = cir::ComplexType::get(real.getType());
+    return create<cir::ComplexCreateOp>(loc, resultComplexTy, real, imag);
+  }
+
+  mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) {
+    auto operandTy = mlir::cast<cir::ComplexType>(operand.getType());
+    return create<cir::ComplexRealOp>(loc, operandTy.getElementType(), operand);
+  }
+
+  mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) {
+    auto operandTy = mlir::cast<cir::ComplexType>(operand.getType());
+    return create<cir::ComplexImagOp>(loc, operandTy.getElementType(), operand);
+  }
+
   mlir::Value createNot(mlir::Value value) {
     return create<cir::UnaryOp>(value.getLoc(), value.getType(),
                                 cir::UnaryOpKind::Not, value);
@@ -169,6 +185,11 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return create(loc);
   }
 
+  mlir::Value createUnaryOp(mlir::Location loc, cir::UnaryOpKind kind,
+                            mlir::Value operand) {
+    return create<cir::UnaryOp>(loc, kind, operand);
+  }
+
   mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) {
     return cir::ConstPtrAttr::get(type, getI64IntegerAttr(value));
   }
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 785478abb0778..29d8aea8d08e7 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -10,14 +10,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
-#define LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
+#ifndef CLANG_CIR_DIALECT_IR_CIRATTRS_TD
+#define CLANG_CIR_DIALECT_IR_CIRATTRS_TD
 
 include "mlir/IR/BuiltinAttributeInterfaces.td"
-include "mlir/IR/EnumAttr.td"
-include "clang/CIR/Dialect/IR/CIRDialect.td"
 include "clang/CIR/Dialect/IR/CIRAttrConstraints.td"
+include "clang/CIR/Dialect/IR/CIRDialect.td"
+include "clang/CIR/Dialect/IR/CIREnumAttr.td"
 
 //===----------------------------------------------------------------------===//
 // CIR Attrs
@@ -42,21 +42,6 @@ class CIR_TypedAttr<string name, string attrMnemonic, list<Trait> traits = []>
   let assemblyFormat = [{}];
 }
 
-class CIR_I32EnumAttr<string name, string summary, list<I32EnumAttrCase> cases>
-    : I32EnumAttr<name, summary, cases> {
-  let cppNamespace = "::cir";
-}
-
-class CIR_I64EnumAttr<string name, string summary, list<I64EnumAttrCase> cases>
-    : I64EnumAttr<name, summary, cases> {
-  let cppNamespace = "::cir";
-}
-
-class CIR_EnumAttr<EnumAttrInfo info, string name = "", list<Trait> traits = []>
-    : EnumAttr<CIR_Dialect, info, name, traits> {
-  let assemblyFormat = "`<` $value `>`";
-}
-
 class CIRUnitAttr<string name, string attrMnemonic, list<Trait> traits = []>
     : CIR_Attr<name, attrMnemonic, traits> {
   let returnType = "bool";
@@ -516,4 +501,4 @@ def BitfieldInfoAttr : CIR_Attr<"BitfieldInfo", "bitfield_info"> {
   ];
 }
 
-#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRATTRS_TD
+#endif // CLANG_CIR_DIALECT_IR_CIRATTRS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
index fdd56ac1f218f..ddcb988f9ea84 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT_H
-#define LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT_H
+#ifndef CLANG_CIR_DIALECT_IR_CIRDIALECT_H
+#define CLANG_CIR_DIALECT_IR_CIRDIALECT_H
 
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -49,4 +49,4 @@ void buildTerminatedBody(mlir::OpBuilder &builder, mlir::Location loc);
 #define GET_OP_CLASSES
 #include "clang/CIR/Dialect/IR/CIROps.h.inc"
 
-#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT_H
+#endif // CLANG_CIR_DIALECT_IR_CIRDIALECT_H
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
index fa73e5bf41f23..3fdbf65573b36 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT
-#define LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT
+#ifndef CLANG_CIR_DIALECT_IR_CIRDIALECT_TD
+#define CLANG_CIR_DIALECT_IR_CIRDIALECT_TD
 
 include "mlir/IR/OpBase.td"
 
@@ -56,4 +56,4 @@ def CIR_Dialect : Dialect {
   }];
 }
 
-#endif // LLVM_CLANG_CIR_DIALECT_IR_CIRDIALECT
+#endif // CLANG_CIR_DIALECT_IR_CIRDIALECT_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIREnumAttr.td b/clang/include/clang/CIR/Dialect/IR/CIREnumAttr.td
new file mode 100644
index 0000000000000..98b8a31d2a18a
--- /dev/null
+++ b/clang/include/clang/CIR/Dialect/IR/CIREnumAttr.td
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the CIR dialect enum base classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_CIR_DIALECT_IR_CIRENUMATTR_TD
+#define CLANG_CIR_DIALECT_IR_CIRENUMATTR_TD
+
+include "mlir/IR/EnumAttr.td"
+
+class CIR_I32EnumAttr<string name, string summary, list<I32EnumAttrCase> cases>
+    : I32EnumAttr<name, summary, cases> {
+  let cppNamespace = "::cir";
+}
+
+class CIR_I64EnumAttr<string name, string summary, list<I64EnumAttrCase> cases>
+    : I64EnumAttr<name, summary, cases> {
+  let cppNamespace = "::cir";
+}
+
+class CIR_EnumAttr<EnumAttrInfo info, string name = "", list<Trait> traits = []>
+    : EnumAttr<CIR_Dialect, info, name, traits> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+class CIR_DefaultValuedEnumParameter<EnumAttrInfo info, string value = "">
+    : EnumParameter<info> {
+  let defaultValue = value;
+}
+
+#endif // CLANG_CIR_DIALECT_IR_CIRENUMATTR_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 676ff76dff661..d19cd83d78b40 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -159,9 +159,9 @@ def CIR_CastKind : CIR_I32EnumAttr<"CastKind", "cast kind", [
   I32EnumAttrCase<"bool_to_float", 1000>,
 ]>;
 
-def CastOp : CIR_Op<"cast",
-    [Pure,
-     DeclareOpInterfaceMethods<PromotableOpInterface>]> {
+def CIR_CastOp : CIR_Op<"cast", [
+  Pure, DeclareOpInterfaceMethods<PromotableOpInterface>
+]> {
   // FIXME: not all conversions are free of side effects.
let summary = "Conversion between values of different types"; let description = [{ @@ -225,8 +225,9 @@ def CastOp : CIR_Op<"cast", // PtrStrideOp //===----------------------------------------------------------------------===// -def PtrStrideOp : CIR_Op<"ptr_stride", - [Pure, AllTypesMatch<["base", "result"]>]> { +def CIR_PtrStrideOp : CIR_Op<"ptr_stride", [ + Pure, AllTypesMatch<["base", "result"]> +]> { let summary = "Pointer access with stride"; let description = [{ Given a base pointer as first operand, provides a new pointer after applying @@ -262,8 +263,9 @@ def PtrStrideOp : CIR_Op<"ptr_stride", // ConstantOp //===----------------------------------------------------------------------===// -def ConstantOp : CIR_Op<"const", - [ConstantLike, Pure, AllTypesMatch<["value", "res"]>]> { +def CIR_ConstantOp : CIR_Op<"const", [ + ConstantLike, Pure, AllTypesMatch<["value", "res"]> +]> { let summary = "Defines a CIR constant"; let description = [{ The `cir.const` operation turns a literal into an SSA value. The data is @@ -298,22 +300,23 @@ def ConstantOp : CIR_Op<"const", // AllocaOp //===----------------------------------------------------------------------===// -class AllocaTypesMatchWith - : PredOpTrait : PredOpTrait> { + ", $" # rhsArg # ")"> +> { string lhs = lhsArg; string rhs = rhsArg; string transformer = transform; } -def AllocaOp : CIR_Op<"alloca", [ - AllocaTypesMatchWith<"'allocaType' matches pointee type of 'addr'", - "addr", "allocaType", - "cast($_self).getPointee()">, - DeclareOpInterfaceMethods]> { +def CIR_AllocaOp : CIR_Op<"alloca", [ + CIR_AllocaTypesMatchWith<"'allocaType' matches pointee type of 'addr'", + "addr", "allocaType", "mlir::cast($_self).getPointee()">, + DeclareOpInterfaceMethods +]> { let summary = "Defines a scope-local variable"; let description = [{ The `cir.alloca` operation defines a scope-local variable. 
@@ -374,12 +377,11 @@ def AllocaOp : CIR_Op<"alloca", [
 // LoadOp
 //===----------------------------------------------------------------------===//
 
-def LoadOp : CIR_Op<"load", [
+def CIR_LoadOp : CIR_Op<"load", [
   TypesMatchWith<"type of 'result' matches pointee type of 'addr'",
-                 "addr", "result",
-                 "cast<PointerType>($_self).getPointee()">,
-  DeclareOpInterfaceMethods<PromotableMemOpInterface>]> {
-
+                 "addr", "result", "mlir::cast<cir::PointerType>($_self).getPointee()">,
+  DeclareOpInterfaceMethods<PromotableMemOpInterface>
+]> {
   let summary = "Load value from memory address";
   let description = [{
     `cir.load` reads a value (lvalue to rvalue conversion) given an address
@@ -420,12 +422,11 @@ def LoadOp : CIR_Op<"load", [
 // StoreOp
 //===----------------------------------------------------------------------===//
 
-def StoreOp : CIR_Op<"store", [
+def CIR_StoreOp : CIR_Op<"store", [
   TypesMatchWith<"type of 'value' matches pointee type of 'addr'",
-                 "addr", "value",
-                 "cast<PointerType>($_self).getPointee()">,
-  DeclareOpInterfaceMethods<PromotableMemOpInterface>]> {
-
+                 "addr", "value", "mlir::cast<cir::PointerType>($_self).getPointee()">,
+  DeclareOpInterfaceMethods<PromotableMemOpInterface>
+]> {
   let summary = "Store value to memory address";
   let description = [{
     `cir.store` stores a value (first operand) to the memory address specified
@@ -461,10 +462,14 @@ def StoreOp : CIR_Op<"store", [
 // ReturnOp
 //===----------------------------------------------------------------------===//
 
-def ReturnOp : CIR_Op<"return", [ParentOneOf<["FuncOp", "ScopeOp", "IfOp",
-                                              "SwitchOp", "DoWhileOp","WhileOp",
-                                              "ForOp", "CaseOp"]>,
-                                 Terminator]> {
+defvar CIR_ReturnableScopes = [
+  "FuncOp", "ScopeOp", "IfOp", "SwitchOp", "CaseOp",
+  "DoWhileOp", "WhileOp", "ForOp"
+];
+
+def CIR_ReturnOp : CIR_Op<"return", [
+  ParentOneOf<CIR_ReturnableScopes>, Terminator
+]> {
   let summary = "Return from function";
   let description = [{
     The "return" operation represents a return operation within a function.
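The new CIR_ReturnableScopes defvar above enumerates every region kind a `cir.return` may terminate. A plain-C++ illustration of why several of the listed parents are needed, not part of the patch (the function name classify is invented):

int classify(int x) {
  if (x < 0)
    return -1;  // a cir.return whose parent is cir.if
  switch (x) {
  case 0:
    return 0;   // parent is cir.case inside cir.switch
  }
  while (x > 100)
    return 100; // parent is cir.while
  {
    return x;   // parent is a plain cir.scope
  }
}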
@@ -504,10 +509,10 @@ def ReturnOp : CIR_Op<"return", [ParentOneOf<["FuncOp", "ScopeOp", "IfOp", // IfOp //===----------------------------------------------------------------------===// -def IfOp : CIR_Op<"if", - [DeclareOpInterfaceMethods, - RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments]>{ - +def CIR_IfOp : CIR_Op<"if", [ + DeclareOpInterfaceMethods, + RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments +]> { let summary = "the if-then-else operation"; let description = [{ The `cir.if` operation represents an if-then-else construct for @@ -555,10 +560,11 @@ def IfOp : CIR_Op<"if", // ConditionOp //===----------------------------------------------------------------------===// -def ConditionOp : CIR_Op<"condition", [ +def CIR_ConditionOp : CIR_Op<"condition", [ Terminator, - DeclareOpInterfaceMethods + DeclareOpInterfaceMethods ]> { let summary = "Loop continuation condition."; let description = [{ @@ -600,10 +606,14 @@ def ConditionOp : CIR_Op<"condition", [ // YieldOp //===----------------------------------------------------------------------===// -def YieldOp : CIR_Op<"yield", [ReturnLike, Terminator, - ParentOneOf<["CaseOp", "DoWhileOp", "ForOp", - "IfOp", "ScopeOp", "SwitchOp", - "TernaryOp", "WhileOp"]>]> { +defvar CIR_YieldableScopes = [ + "CaseOp", "DoWhileOp", "ForOp", "IfOp", "ScopeOp", "SwitchOp", + "TernaryOp", "WhileOp" +]; + +def CIR_YieldOp : CIR_Op<"yield", [ + ReturnLike, Terminator, ParentOneOf +]> { let summary = "Represents the default branching behaviour of a region"; let description = [{ The `cir.yield` operation terminates regions on different CIR operations, @@ -663,7 +673,7 @@ def YieldOp : CIR_Op<"yield", [ReturnLike, Terminator, // BreakOp //===----------------------------------------------------------------------===// -def BreakOp : CIR_Op<"break", [Terminator]> { +def CIR_BreakOp : CIR_Op<"break", [Terminator]> { let summary = "C/C++ `break` statement equivalent"; let description = [{ The `cir.break` operation is used to cease the execution of the current loop @@ -678,7 +688,7 @@ def BreakOp : CIR_Op<"break", [Terminator]> { // ContinueOp //===----------------------------------------------------------------------===// -def ContinueOp : CIR_Op<"continue", [Terminator]> { +def CIR_ContinueOp : CIR_Op<"continue", [Terminator]> { let summary = "C/C++ `continue` statement equivalent"; let description = [{ The `cir.continue` operation is used to end execution of the current @@ -693,9 +703,10 @@ def ContinueOp : CIR_Op<"continue", [Terminator]> { // ScopeOp //===----------------------------------------------------------------------===// -def ScopeOp : CIR_Op<"scope", [ - DeclareOpInterfaceMethods, - RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments]> { +def CIR_ScopeOp : CIR_Op<"scope", [ + DeclareOpInterfaceMethods, + RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments +]> { let summary = "Represents a C/C++ scope"; let description = [{ `cir.scope` contains one region and defines a strict "scope" for all new @@ -757,9 +768,10 @@ def CIR_CaseOpKind : CIR_I32EnumAttr<"CaseOpKind", "case kind", [ I32EnumAttrCase<"Range", 3, "range"> ]>; -def CaseOp : CIR_Op<"case", [ - DeclareOpInterfaceMethods, - RecursivelySpeculatable, AutomaticAllocationScope]> { +def CIR_CaseOp : CIR_Op<"case", [ + DeclareOpInterfaceMethods, + RecursivelySpeculatable, AutomaticAllocationScope +]> { let summary = "Case operation"; let description = [{ The `cir.case` operation represents a case within a C/C++ switch. 
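Like their C/C++ counterparts, the `cir.break` and `cir.continue` ops above are plain terminators: nothing in the surrounding region executes after them. A short illustrative C++ function, not part of the patch (sumUpTo is an invented name):

int sumUpTo(const int *v, int n, int limit) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    if (v[i] < 0)
      continue; // cir.continue: terminates this iteration's body region
    if (sum + v[i] > limit)
      break;    // cir.break: terminates the enclosing cir.for
    sum += v[i];
  }
  return sum;
}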
@@ -791,10 +803,11 @@ def CaseOp : CIR_Op<"case", [ ]; } -def SwitchOp : CIR_Op<"switch", - [SameVariadicOperandSize, - DeclareOpInterfaceMethods, - RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments]> { +def CIR_SwitchOp : CIR_Op<"switch", [ + SameVariadicOperandSize, + DeclareOpInterfaceMethods, + RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments +]> { let summary = "Switch operation"; let description = [{ The `cir.switch` operation represents C/C++ switch functionality for @@ -959,8 +972,10 @@ def SwitchOp : CIR_Op<"switch", // SwitchFlatOp //===----------------------------------------------------------------------===// -def SwitchFlatOp : CIR_Op<"switch.flat", [AttrSizedOperandSegments, - Terminator]> { +def CIR_SwitchFlatOp : CIR_Op<"switch.flat", [ + AttrSizedOperandSegments, Terminator +]> { + let summary = "A flattened version of cir.switch"; let description = [{ The `cir.switch.flat` operation is a region-less and simplified @@ -1005,9 +1020,10 @@ def SwitchFlatOp : CIR_Op<"switch.flat", [AttrSizedOperandSegments, // BrOp //===----------------------------------------------------------------------===// -def BrOp : CIR_Op<"br", - [DeclareOpInterfaceMethods, - Pure, Terminator]> { +def CIR_BrOp : CIR_Op<"br",[ + DeclareOpInterfaceMethods, + Pure, Terminator +]> { let summary = "Unconditional branch"; let description = [{ The `cir.br` branches unconditionally to a block. Used to represent C/C++ @@ -1053,7 +1069,7 @@ def CIR_UnaryOpKind : CIR_I32EnumAttr<"UnaryOpKind", "unary operation kind", [ I32EnumAttrCase<"Not", 4, "not"> ]>; -def UnaryOp : CIR_Op<"unary", [Pure, SameOperandsAndResultType]> { +def CIR_UnaryOp : CIR_Op<"unary", [Pure, SameOperandsAndResultType]> { let summary = "Unary operations"; let description = [{ `cir.unary` performs the unary operation according to @@ -1093,9 +1109,10 @@ def UnaryOp : CIR_Op<"unary", [Pure, SameOperandsAndResultType]> { // BrCondOp //===----------------------------------------------------------------------===// -def BrCondOp : CIR_Op<"brcond", - [DeclareOpInterfaceMethods, - Pure, Terminator, AttrSizedOperandSegments]> { +def CIR_BrCondOp : CIR_Op<"brcond", [ + DeclareOpInterfaceMethods, + Pure, Terminator, AttrSizedOperandSegments +]> { let summary = "Conditional branch"; let description = [{ The `cir.brcond %cond, ^bb0, ^bb1` branches to 'bb0' block in case @@ -1140,9 +1157,8 @@ def BrCondOp : CIR_Op<"brcond", // Common loop op definitions //===----------------------------------------------------------------------===// -class LoopOpBase : CIR_Op : CIR_Op { let extraClassDefinition = [{ void $cppClass::getSuccessorRegions( @@ -1160,7 +1176,7 @@ class LoopOpBase : CIR_Op : LoopOpBase { +class CIR_WhileOpBase : CIR_LoopOpBase { defvar isWhile = !eq(mnemonic, "while"); let summary = "C/C++ " # !if(isWhile, "while", "do-while") # " loop"; let builders = [ @@ -1180,7 +1196,7 @@ class WhileOpBase : LoopOpBase { ]; } -def WhileOp : WhileOpBase<"while"> { +def CIR_WhileOp : CIR_WhileOpBase<"while"> { let regions = (region SizedRegion<1>:$cond, MinSizedRegion<1>:$body); let assemblyFormat = "$cond `do` $body attr-dict"; @@ -1205,7 +1221,7 @@ def WhileOp : WhileOpBase<"while"> { }]; } -def DoWhileOp : WhileOpBase<"do"> { +def CIR_DoWhileOp : CIR_WhileOpBase<"do"> { let regions = (region MinSizedRegion<1>:$body, SizedRegion<1>:$cond); let assemblyFormat = " $body `while` $cond attr-dict"; @@ -1235,7 +1251,7 @@ def DoWhileOp : WhileOpBase<"do"> { // ForOp 
//===----------------------------------------------------------------------===// -def ForOp : LoopOpBase<"for"> { +def CIR_ForOp : CIR_LoopOpBase<"for"> { let summary = "C/C++ for loop counterpart"; let description = [{ Represents a C/C++ for loop. It consists of three regions: @@ -1311,8 +1327,7 @@ def CIR_CmpOpKind : CIR_I32EnumAttr<"CmpOpKind", "compare operation kind", [ I32EnumAttrCase<"ne", 5> ]>; -def CmpOp : CIR_Op<"cmp", [Pure, SameTypeOperands]> { - +def CIR_CmpOp : CIR_Op<"cmp", [Pure, SameTypeOperands]> { let summary = "Compare values two values and produce a boolean result"; let description = [{ `cir.cmp` compares two input operands of the same type and produces a @@ -1355,9 +1370,9 @@ def CIR_BinOpKind : CIR_I32EnumAttr< I32EnumAttrCase<"Max", 8, "max"> ]>; -def BinOp : CIR_Op<"binop", [Pure, - SameTypeOperands, SameOperandsAndResultType]> { - +def CIR_BinOp : CIR_Op<"binop", [ + Pure, SameTypeOperands, SameOperandsAndResultType +]> { let summary = "Binary operations (arith and logic)"; let description = [{ cir.binop performs the binary operation according to @@ -1411,7 +1426,7 @@ def BinOp : CIR_Op<"binop", [Pure, // ShiftOp //===----------------------------------------------------------------------===// -def ShiftOp : CIR_Op<"shift", [Pure]> { +def CIR_ShiftOp : CIR_Op<"shift", [Pure]> { let summary = "Shift"; let description = [{ The `cir.shift` operation performs a bitwise shift, either to the left or to @@ -1452,8 +1467,9 @@ def ShiftOp : CIR_Op<"shift", [Pure]> { // SelectOp //===----------------------------------------------------------------------===// -def SelectOp : CIR_Op<"select", [Pure, - AllTypesMatch<["true_value", "false_value", "result"]>]> { +def CIR_SelectOp : CIR_Op<"select", [ + Pure, AllTypesMatch<["true_value", "false_value", "result"]> +]> { let summary = "Yield one of two values based on a boolean value"; let description = [{ The `cir.select` operation takes three operands. The first operand @@ -1492,9 +1508,10 @@ def SelectOp : CIR_Op<"select", [Pure, // TernaryOp //===----------------------------------------------------------------------===// -def TernaryOp : CIR_Op<"ternary", - [DeclareOpInterfaceMethods, - RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments]> { +def CIR_TernaryOp : CIR_Op<"ternary", [ + DeclareOpInterfaceMethods, + RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments +]> { let summary = "The `cond ? a : b` C/C++ ternary operation"; let description = [{ The `cir.ternary` operation represents C/C++ ternary, much like a `select` @@ -1583,8 +1600,9 @@ def CIR_GlobalLinkageKind : CIR_I32EnumAttr< // properties of a global variable will be added over time as more of ClangIR // is upstreamed. -def GlobalOp : CIR_Op<"global", - [DeclareOpInterfaceMethods]> { +def CIR_GlobalOp : CIR_Op<"global", [ + DeclareOpInterfaceMethods +]> { let summary = "Declare or define a global variable"; let description = [{ The `cir.global` operation declares or defines a named global variable. 
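As a quick hedged illustration (not from the patch): these are the kinds of C++ file-scope declarations that would produce `cir.global` definitions or declarations; the variable names are hypothetical, and the linkage variations correspond to `CIR_GlobalLinkageKind` above.

```c++
// Hypothetical translation unit; each declaration lowers to one cir.global.
int defined_global = 42;      // definition with an initializer
extern int external_decl;     // declaration only
static double internal = 1.0; // internal linkage (cf. CIR_GlobalLinkageKind)
```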
@@ -1645,8 +1663,9 @@ def GlobalOp : CIR_Op<"global", // GetGlobalOp //===----------------------------------------------------------------------===// -def GetGlobalOp : CIR_Op<"get_global", - [Pure, DeclareOpInterfaceMethods]> { +def CIR_GetGlobalOp : CIR_Op<"get_global", [ + Pure, DeclareOpInterfaceMethods +]> { let summary = "Get the address of a global variable"; let description = [{ The `cir.get_global` operation retrieves the address pointing to a @@ -1673,7 +1692,7 @@ def GetGlobalOp : CIR_Op<"get_global", // SetBitfieldOp //===----------------------------------------------------------------------===// -def SetBitfieldOp : CIR_Op<"set_bitfield"> { +def CIR_SetBitfieldOp : CIR_Op<"set_bitfield"> { let summary = "Set the value of a bitfield member"; let description = [{ The `cir.set_bitfield` operation provides a store-like access to @@ -1761,7 +1780,7 @@ def SetBitfieldOp : CIR_Op<"set_bitfield"> { // GetBitfieldOp //===----------------------------------------------------------------------===// -def GetBitfieldOp : CIR_Op<"get_bitfield"> { +def CIR_GetBitfieldOp : CIR_Op<"get_bitfield"> { let summary = "Get the information for a bitfield member"; let description = [{ The `cir.get_bitfield` operation provides a load-like access to @@ -1843,7 +1862,7 @@ def GetBitfieldOp : CIR_Op<"get_bitfield"> { // GetMemberOp //===----------------------------------------------------------------------===// -def GetMemberOp : CIR_Op<"get_member"> { +def CIR_GetMemberOp : CIR_Op<"get_member"> { let summary = "Get the address of a member of a record"; let description = [{ The `cir.get_member` operation gets the address of a particular named @@ -1905,7 +1924,7 @@ def GetMemberOp : CIR_Op<"get_member"> { // TODO(CIR): FuncOp is still a tiny shell of what it will become. Many more // properties and attributes will be added as upstreaming continues. -def FuncOp : CIR_Op<"func", [ +def CIR_FuncOp : CIR_Op<"func", [ AutomaticAllocationScope, CallableOpInterface, FunctionOpInterface, DeclareOpInterfaceMethods, IsolatedFromAbove @@ -2029,10 +2048,10 @@ def CIR_SideEffect : CIR_I32EnumAttr< } class CIR_CallOpBase extra_traits = []> - : Op, - DeclareOpInterfaceMethods])> { + : CIR_Op, + DeclareOpInterfaceMethods + ])> { let extraClassDeclaration = [{ /// Get the argument operands to the called function. mlir::OperandRange getArgOperands(); @@ -2086,7 +2105,7 @@ class CIR_CallOpBase extra_traits = []> DefaultValuedAttr:$side_effect); } -def CallOp : CIR_CallOpBase<"call", [NoRegionArguments]> { +def CIR_CallOp : CIR_CallOpBase<"call", [NoRegionArguments]> { let summary = "call a function"; let description = [{ The `cir.call` operation represents a function call. It could represent @@ -2127,7 +2146,7 @@ def CallOp : CIR_CallOpBase<"call", [NoRegionArguments]> { // StackSaveOp & StackRestoreOp //===----------------------------------------------------------------------===// -def StackSaveOp : CIR_Op<"stacksave"> { +def CIR_StackSaveOp : CIR_Op<"stacksave"> { let summary = "remembers the current state of the function stack"; let description = [{ Saves current state of the function stack. 
Returns a pointer to an opaque object @@ -2145,7 +2164,7 @@ def StackSaveOp : CIR_Op<"stacksave"> { let assemblyFormat = "attr-dict `:` qualified(type($result))"; } -def StackRestoreOp : CIR_Op<"stackrestore"> { +def CIR_StackRestoreOp : CIR_Op<"stackrestore"> { let summary = "restores the state of the function stack"; let description = [{ Restore the state of the function stack to the state it was @@ -2171,7 +2190,7 @@ def StackRestoreOp : CIR_Op<"stackrestore"> { // UnreachableOp //===----------------------------------------------------------------------===// -def UnreachableOp : CIR_Op<"unreachable", [Terminator]> { +def CIR_UnreachableOp : CIR_Op<"unreachable", [Terminator]> { let summary = "invoke immediate undefined behavior"; let description = [{ If the program control flow reaches a `cir.unreachable` operation, the @@ -2187,7 +2206,7 @@ def UnreachableOp : CIR_Op<"unreachable", [Terminator]> { // TrapOp //===----------------------------------------------------------------------===// -def TrapOp : CIR_Op<"trap", [Terminator]> { +def CIR_TrapOp : CIR_Op<"trap", [Terminator]> { let summary = "Exit the program abnormally"; let description = [{ The cir.trap operation causes the program to exit abnormally. The @@ -2204,8 +2223,7 @@ def TrapOp : CIR_Op<"trap", [Terminator]> { // VecCreate //===----------------------------------------------------------------------===// -def VecCreateOp : CIR_Op<"vec.create", [Pure]> { - +def CIR_VecCreateOp : CIR_Op<"vec.create", [Pure]> { let summary = "Create a vector value"; let description = [{ The `cir.vec.create` operation creates a vector value with the given element @@ -2229,11 +2247,12 @@ def VecCreateOp : CIR_Op<"vec.create", [Pure]> { // VecInsertOp //===----------------------------------------------------------------------===// -def VecInsertOp : CIR_Op<"vec.insert", [Pure, - TypesMatchWith<"argument type matches vector element type", "vec", "value", - "cast($_self).getElementType()">, - AllTypesMatch<["result", "vec"]>]> { - +def CIR_VecInsertOp : CIR_Op<"vec.insert", [ + Pure, + TypesMatchWith<"argument type matches vector element type", + "vec", "value", "mlir::cast($_self).getElementType()">, + AllTypesMatch<["result", "vec"]> +]> { let summary = "Insert one element into a vector object"; let description = [{ The `cir.vec.insert` operation produces a new vector by replacing @@ -2265,10 +2284,11 @@ def VecInsertOp : CIR_Op<"vec.insert", [Pure, // VecExtractOp //===----------------------------------------------------------------------===// -def VecExtractOp : CIR_Op<"vec.extract", [Pure, - TypesMatchWith<"type of 'result' matches element type of 'vec'", "vec", - "result", "cast($_self).getElementType()">]> { - +def CIR_VecExtractOp : CIR_Op<"vec.extract", [ + Pure, + TypesMatchWith<"type of 'result' matches element type of 'vec'", + "vec", "result", "mlir::cast($_self).getElementType()"> +]> { let summary = "Extract one element from a vector object"; let description = [{ The `cir.vec.extract` operation extracts the element at the given index @@ -2295,8 +2315,7 @@ def VecExtractOp : CIR_Op<"vec.extract", [Pure, // VecCmpOp //===----------------------------------------------------------------------===// -def VecCmpOp : CIR_Op<"vec.cmp", [Pure, SameTypeOperands]> { - +def CIR_VecCmpOp : CIR_Op<"vec.cmp", [Pure, SameTypeOperands]> { let summary = "Compare two vectors"; let description = [{ The `cir.vec.cmp` operation does an element-wise comparison of two vectors @@ -2334,8 +2353,9 @@ def VecCmpOp : CIR_Op<"vec.cmp", [Pure, SameTypeOperands]> { 
// implement. This could be useful for passes that don't care how the vector // shuffle was specified. -def VecShuffleOp : CIR_Op<"vec.shuffle", - [Pure, AllTypesMatch<["vec1", "vec2"]>]> { +def CIR_VecShuffleOp : CIR_Op<"vec.shuffle", [ + Pure, AllTypesMatch<["vec1", "vec2"]> +]> { let summary = "Combine two vectors using indices passed as constant integers"; let description = [{ The `cir.vec.shuffle` operation implements the documented form of Clang's @@ -2378,8 +2398,9 @@ def VecShuffleOp : CIR_Op<"vec.shuffle", // VecShuffleDynamicOp //===----------------------------------------------------------------------===// -def VecShuffleDynamicOp : CIR_Op<"vec.shuffle.dynamic", - [Pure, AllTypesMatch<["vec", "result"]>]> { +def CIR_VecShuffleDynamicOp : CIR_Op<"vec.shuffle.dynamic", [ + Pure, AllTypesMatch<["vec", "result"]> +]> { let summary = "Shuffle a vector using indices in another vector"; let description = [{ The `cir.vec.shuffle.dynamic` operation implements the undocumented form of @@ -2413,8 +2434,9 @@ def VecShuffleDynamicOp : CIR_Op<"vec.shuffle.dynamic", // VecTernaryOp //===----------------------------------------------------------------------===// -def VecTernaryOp : CIR_Op<"vec.ternary", - [Pure, AllTypesMatch<["result", "lhs", "rhs"]>]> { +def CIR_VecTernaryOp : CIR_Op<"vec.ternary", [ + Pure, AllTypesMatch<["result", "lhs", "rhs"]> +]> { let summary = "The `cond ? a : b` ternary operator for vector types"; let description = [{ The `cir.vec.ternary` operation represents the C/C++ ternary operator, @@ -2451,10 +2473,11 @@ def VecTernaryOp : CIR_Op<"vec.ternary", // VecSplatOp //===----------------------------------------------------------------------===// -def VecSplatOp : CIR_Op<"vec.splat", [Pure, - TypesMatchWith<"type of 'value' matches element type of 'result'", "result", - "value", "cast($_self).getElementType()">]> { - +def CIR_VecSplatOp : CIR_Op<"vec.splat", [ + Pure, + TypesMatchWith<"type of 'value' matches element type of 'result'", + "result", "value", "mlir::cast($_self).getElementType()"> +]> { let summary = "Convert a scalar into a vector"; let description = [{ The `cir.vec.splat` operation creates a vector value from a scalar value. 
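A hedged C-level sketch of the splat semantics described above, using Clang's GCC-style vector extensions (the typedef `v4si` and function name `add_scalar` are hypothetical): a scalar operand in a mixed scalar/vector expression is broadcast to every lane, which is the pattern `cir.vec.splat` represents.

```c++
// Hypothetical example using Clang vector extensions; the scalar `s` is
// splatted across all four lanes before the element-wise addition.
typedef int v4si __attribute__((vector_size(16)));

v4si add_scalar(v4si v, int s) {
  return v + s; // scalar-to-vector splat, then element-wise add
}
```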
@@ -2483,7 +2506,7 @@ def VecSplatOp : CIR_Op<"vec.splat", [Pure, // BaseClassAddrOp //===----------------------------------------------------------------------===// -def BaseClassAddrOp : CIR_Op<"base_class_addr"> { +def CIR_BaseClassAddrOp : CIR_Op<"base_class_addr"> { let summary = "Get the base class address for a class/struct"; let description = [{ The `cir.base_class_addr` operaration gets the address of a particular @@ -2526,7 +2549,7 @@ def BaseClassAddrOp : CIR_Op<"base_class_addr"> { // ComplexCreateOp //===----------------------------------------------------------------------===// -def ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> { +def CIR_ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> { let summary = "Create a complex value from its real and imaginary parts"; let description = [{ The `cir.complex.create` operation takes two operands that represent the @@ -2558,7 +2581,7 @@ def ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> { // ComplexRealOp //===----------------------------------------------------------------------===// -def ComplexRealOp : CIR_Op<"complex.real", [Pure]> { +def CIR_ComplexRealOp : CIR_Op<"complex.real", [Pure]> { let summary = "Extract the real part of a complex value"; let description = [{ `cir.complex.real` operation takes an operand of `!cir.complex` type and @@ -2587,7 +2610,7 @@ def ComplexRealOp : CIR_Op<"complex.real", [Pure]> { // ComplexImagOp //===----------------------------------------------------------------------===// -def ComplexImagOp : CIR_Op<"complex.imag", [Pure]> { +def CIR_ComplexImagOp : CIR_Op<"complex.imag", [Pure]> { let summary = "Extract the imaginary part of a complex value"; let description = [{ `cir.complex.imag` operation takes an operand of `!cir.complex` type and @@ -2616,7 +2639,7 @@ def ComplexImagOp : CIR_Op<"complex.imag", [Pure]> { // ComplexRealPtrOp //===----------------------------------------------------------------------===// -def ComplexRealPtrOp : CIR_Op<"complex.real_ptr", [Pure]> { +def CIR_ComplexRealPtrOp : CIR_Op<"complex.real_ptr", [Pure]> { let summary = "Derive a pointer to the real part of a complex value"; let description = [{ `cir.complex.real_ptr` operation takes a pointer operand that points to a @@ -2646,7 +2669,7 @@ def ComplexRealPtrOp : CIR_Op<"complex.real_ptr", [Pure]> { // ComplexImagPtrOp //===----------------------------------------------------------------------===// -def ComplexImagPtrOp : CIR_Op<"complex.imag_ptr", [Pure]> { +def CIR_ComplexImagPtrOp : CIR_Op<"complex.imag_ptr", [Pure]> { let summary = "Derive a pointer to the imaginary part of a complex value"; let description = [{ `cir.complex.imag_ptr` operation takes a pointer operand that points to a @@ -2676,7 +2699,9 @@ def ComplexImagPtrOp : CIR_Op<"complex.imag_ptr", [Pure]> { // ComplexAddOp //===----------------------------------------------------------------------===// -def ComplexAddOp : CIR_Op<"complex.add", [Pure, SameOperandsAndResultType]> { +def CIR_ComplexAddOp : CIR_Op<"complex.add", [ + Pure, SameOperandsAndResultType +]> { let summary = "Complex addition"; let description = [{ The `cir.complex.add` operation takes two complex numbers and returns @@ -2702,7 +2727,9 @@ def ComplexAddOp : CIR_Op<"complex.add", [Pure, SameOperandsAndResultType]> { // ComplexSubOp //===----------------------------------------------------------------------===// -def ComplexSubOp : CIR_Op<"complex.sub", [Pure, SameOperandsAndResultType]> { +def CIR_ComplexSubOp : 
CIR_Op<"complex.sub", [ + Pure, SameOperandsAndResultType +]> { let summary = "Complex subtraction"; let description = [{ The `cir.complex.sub` operation takes two complex numbers and returns @@ -2730,7 +2757,7 @@ def ComplexSubOp : CIR_Op<"complex.sub", [Pure, SameOperandsAndResultType]> { //===----------------------------------------------------------------------===// class CIR_BitOpBase - : CIR_Op { + : CIR_Op { let arguments = (ins operandTy:$input); let results = (outs operandTy:$result); @@ -2740,7 +2767,7 @@ class CIR_BitOpBase } class CIR_BitZeroCountOpBase - : CIR_BitOpBase { + : CIR_BitOpBase { let arguments = (ins operandTy:$input, UnitAttr:$poison_zero); let assemblyFormat = [{ @@ -2749,7 +2776,7 @@ class CIR_BitZeroCountOpBase }]; } -def BitClrsbOp : CIR_BitOpBase<"clrsb", CIR_SIntOfWidths<[32, 64]>> { +def CIR_BitClrsbOp : CIR_BitOpBase<"clrsb", CIR_SIntOfWidths<[32, 64]>> { let summary = "Get the number of leading redundant sign bits in the input"; let description = [{ Compute the number of leading redundant sign bits in the input integer. @@ -2779,7 +2806,9 @@ def BitClrsbOp : CIR_BitOpBase<"clrsb", CIR_SIntOfWidths<[32, 64]>> { }]; } -def BitClzOp : CIR_BitZeroCountOpBase<"clz", CIR_UIntOfWidths<[16, 32, 64]>> { +def CIR_BitClzOp : CIR_BitZeroCountOpBase<"clz", + CIR_UIntOfWidths<[16, 32, 64]> +> { let summary = "Get the number of leading 0-bits in the input"; let description = [{ Compute the number of leading 0-bits in the input. @@ -2802,7 +2831,9 @@ def BitClzOp : CIR_BitZeroCountOpBase<"clz", CIR_UIntOfWidths<[16, 32, 64]>> { }]; } -def BitCtzOp : CIR_BitZeroCountOpBase<"ctz", CIR_UIntOfWidths<[16, 32, 64]>> { +def CIR_BitCtzOp : CIR_BitZeroCountOpBase<"ctz", + CIR_UIntOfWidths<[16, 32, 64]> +> { let summary = "Get the number of trailing 0-bits in the input"; let description = [{ Compute the number of trailing 0-bits in the input. @@ -2825,7 +2856,7 @@ def BitCtzOp : CIR_BitZeroCountOpBase<"ctz", CIR_UIntOfWidths<[16, 32, 64]>> { }]; } -def BitParityOp : CIR_BitOpBase<"parity", CIR_UIntOfWidths<[32, 64]>> { +def CIR_BitParityOp : CIR_BitOpBase<"parity", CIR_UIntOfWidths<[32, 64]>> { let summary = "Get the parity of input"; let description = [{ Compute the parity of the input. The parity of an integer is the number of @@ -2844,7 +2875,9 @@ def BitParityOp : CIR_BitOpBase<"parity", CIR_UIntOfWidths<[32, 64]>> { }]; } -def BitPopcountOp : CIR_BitOpBase<"popcount", CIR_UIntOfWidths<[16, 32, 64]>> { +def CIR_BitPopcountOp : CIR_BitOpBase<"popcount", + CIR_UIntOfWidths<[16, 32, 64]> +> { let summary = "Get the number of 1-bits in input"; let description = [{ Compute the number of 1-bits in the input. @@ -2862,8 +2895,9 @@ def BitPopcountOp : CIR_BitOpBase<"popcount", CIR_UIntOfWidths<[16, 32, 64]>> { }]; } -def BitReverseOp : CIR_BitOpBase<"bitreverse", - CIR_UIntOfWidths<[8, 16, 32, 64]>> { +def CIR_BitReverseOp : CIR_BitOpBase<"bitreverse", + CIR_UIntOfWidths<[8, 16, 32, 64]> +> { let summary = "Reverse the bit pattern of the operand integer"; let description = [{ The `cir.bitreverse` operation reverses the bits of the operand integer. 
Its @@ -2877,7 +2911,9 @@ def BitReverseOp : CIR_BitOpBase<"bitreverse", }]; } -def ByteSwapOp : CIR_BitOpBase<"byte_swap", CIR_UIntOfWidths<[16, 32, 64]>> { +def CIR_ByteSwapOp : CIR_BitOpBase<"byte_swap", + CIR_UIntOfWidths<[16, 32, 64]> +> { let summary = "Reverse the bytes in the object representation of the operand"; let description = [{ The `cir.byte_swap` operation takes an integer as operand, reverse the bytes @@ -2898,11 +2934,50 @@ def ByteSwapOp : CIR_BitOpBase<"byte_swap", CIR_UIntOfWidths<[16, 32, 64]>> { }]; } +//===----------------------------------------------------------------------===// +// RotateOp +//===----------------------------------------------------------------------===// + +def CIR_RotateOp : CIR_Op<"rotate", [Pure, SameOperandsAndResultType]> { + let summary = "Rotate the bits in the operand integer"; + let description = [{ + The `cir.rotate` rotates the bits in `input` by the given amount `amount`. + The rotate direction is specified by the `left` and `right` keyword. + + `input` must be an unsigned integer and its width must be either 8, 16, 32, + or 64. The types of `input`, `amount`, and the result must all match. + + Example: + + ```mlir + %r = cir.rotate left %0, %1 : !u32i + %r = cir.rotate right %0, %1 : !u32i + ``` + }]; + + let results = (outs CIR_IntType:$result); + let arguments = (ins + CIR_UIntOfWidths<[8, 16, 32, 64]>:$input, + CIR_IntType:$amount, + UnitAttr:$rotateLeft + ); + + let assemblyFormat = [{ + (`left` $rotateLeft^) : (`right`)? + $input `,` $amount `:` type($result) attr-dict + }]; + + let extraClassDeclaration = [{ + bool isRotateLeft() { return getRotateLeft(); } + bool isRotateRight() { return !getRotateLeft(); } + }]; +} + //===----------------------------------------------------------------------===// // Assume Operations //===----------------------------------------------------------------------===// -def AssumeOp : CIR_Op<"assume"> { +def CIR_AssumeOp : CIR_Op<"assume"> { let summary = "Tell the optimizer that a boolean value is true"; let description = [{ The `cir.assume` operation takes a single boolean prediate as its only @@ -2924,8 +2999,9 @@ def AssumeOp : CIR_Op<"assume"> { // Branch Probability Operations //===----------------------------------------------------------------------===// -def ExpectOp : CIR_Op<"expect", - [Pure, AllTypesMatch<["result", "val", "expected"]>]> { +def CIR_ExpectOp : CIR_Op<"expect", [ + Pure, AllTypesMatch<["result", "val", "expected"]> +]> { let summary = "Tell the optimizer that two values are likely to be equal."; let description = [{ The `cir.expect` operation may take 2 or 3 arguments. 
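The new `cir.rotate`, together with the `cir.assume` and `cir.expect` operations in this hunk, lines up with well-known Clang builtins. A hedged sketch of source code that would exercise all three (the function name `rol32` is hypothetical, and the op mappings are the expected lowerings rather than something the patch itself asserts):

```c++
// Hypothetical input exercising the builtins these CIR ops model.
unsigned rol32(unsigned x, unsigned n) {
  __builtin_assume(n < 32);            // boolean predicate hint -> cir.assume
  if (__builtin_expect(x == 0, 0))     // two-argument form -> cir.expect
    return 0;
  return __builtin_rotateleft32(x, n); // -> cir.rotate left
}
```

The three-argument `cir.expect` form presumably corresponds to `__builtin_expect_with_probability`.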
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.h b/clang/include/clang/CIR/Dialect/IR/CIRTypes.h
index 7f9fb9ef388b3..bfa165cdd945e 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.h
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_DIALECT_CIR_IR_CIRTYPES_H_
-#define MLIR_DIALECT_CIR_IR_CIRTYPES_H_
+#ifndef CLANG_CIR_DIALECT_IR_CIRTYPES_H
+#define CLANG_CIR_DIALECT_IR_CIRTYPES_H
 
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/Types.h"
@@ -50,4 +50,4 @@ namespace cir {
 #define GET_TYPEDEF_CLASSES
 #include "clang/CIR/Dialect/IR/CIROpsTypes.h.inc"
 
-#endif // MLIR_DIALECT_CIR_IR_CIRTYPES_H_
+#endif // CLANG_CIR_DIALECT_IR_CIRTYPES_H
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index 37c32d9889e8b..edd21b55640b9 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_CIR_DIALECT_CIR_TYPES
-#define MLIR_CIR_DIALECT_CIR_TYPES
+#ifndef CLANG_CIR_DIALECT_IR_CIRTYPES_TD
+#define CLANG_CIR_DIALECT_IR_CIRTYPES_TD
 
 include "clang/CIR/Dialect/IR/CIRDialect.td"
 include "clang/CIR/Dialect/IR/CIRTypeConstraints.td"
@@ -638,4 +638,4 @@ def CIR_AnyType : AnyTypeOf<[
   CIR_ComplexType
 ]>;
 
-#endif // MLIR_CIR_DIALECT_CIR_TYPES
+#endif // CLANG_CIR_DIALECT_IR_CIRTYPES_TD
diff --git a/clang/include/clang/CIR/Dialect/Passes.h b/clang/include/clang/CIR/Dialect/Passes.h
index dbecf81acf7bb..02210ec0a8336 100644
--- a/clang/include/clang/CIR/Dialect/Passes.h
+++ b/clang/include/clang/CIR/Dialect/Passes.h
@@ -24,6 +24,7 @@ std::unique_ptr<mlir::Pass> createCIRCanonicalizePass();
 std::unique_ptr<mlir::Pass> createCIRFlattenCFGPass();
 std::unique_ptr<mlir::Pass> createCIRSimplifyPass();
 std::unique_ptr<mlir::Pass> createHoistAllocasPass();
+std::unique_ptr<mlir::Pass> createLoweringPreparePass();
 
 void populateCIRPreLoweringPasses(mlir::OpPassManager &pm);
diff --git a/clang/include/clang/CIR/Dialect/Passes.td b/clang/include/clang/CIR/Dialect/Passes.td
index de775e69f0073..7d5ec2ffed39d 100644
--- a/clang/include/clang/CIR/Dialect/Passes.td
+++ b/clang/include/clang/CIR/Dialect/Passes.td
@@ -33,14 +33,14 @@ def CIRSimplify : Pass<"cir-simplify"> {
   let summary = "Performs CIR simplification and code optimization";
   let description = [{
     The pass performs semantics-preserving code simplifications and optimizations
-    on CIR while maintaining strict program correctness. 
-    
+    on CIR while maintaining strict program correctness.
+
    Unlike the `cir-canonicalize` pass, these transformations may reduce the IR's
    structural similarity to the original source code as a trade-off for improved
    code quality. This can affect debugging fidelity by altering intermediate
-    representations of folded expressions, hoisted operations, and other 
+    representations of folded expressions, hoisted operations, and other
    optimized constructs.
-    
+
    Example transformations include ternary expression folding and code hoisting
    while preserving program semantics.
   }];
@@ -72,4 +72,15 @@ def CIRFlattenCFG : Pass<"cir-flatten-cfg"> {
   let dependentDialects = ["cir::CIRDialect"];
 }
 
+def LoweringPrepare : Pass<"cir-lowering-prepare"> {
+  let summary = "Lower to more fine-grained CIR operations before lowering to "
+                "other dialects";
+  let description = [{
+    This pass does preparation work for lowering to other dialects. For example,
+    it may expand the global variable initialization in a more ABI-friendly form.
+  }];
+  let constructor = "mlir::createLoweringPreparePass()";
+  let dependentDialects = ["cir::CIRDialect"];
+}
+
 #endif // CLANG_CIR_DIALECT_PASSES_TD
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 48e309063d38b..182e4b6784d2f 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -254,6 +254,7 @@ struct MissingFeatures {
   static bool dtorCleanups() { return false; }
   static bool completeDtors() { return false; }
   static bool vtableInitialization() { return false; }
+  static bool msvcBuiltins() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d0b54a446309b..ef8a298e5ddb7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -990,6 +990,13 @@ def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
   Visibility<[ClangOption, CLOption, FlangOption]>,
   HelpText<"Pass <arg> to the linker">, MetaVarName<"<arg>">, Group<Link_Group>;
+def Xthinlto_distributor_EQ : CommaJoined<["-"], "Xthinlto-distributor=">,
+  Flags<[LinkOption]>,
+  Visibility<[ClangOption, CLOption]>,
+  HelpText<"Pass <arg> to the ThinLTO distributor process. Can be specified "
+           "multiple times or with comma-separated values.">,
+  MetaVarName<"<arg>">,
+  Group<Link_Group>;
 def Xoffload_linker : JoinedAndSeparate<["-"], "Xoffload-linker">,
   Visibility<[ClangOption, FlangOption]>,
   HelpText<"Pass <arg> to the offload linkers or the ones identified by -<kind>">,
@@ -1415,19 +1422,6 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
     HelpText<"Do not override toolchain to compile HIP source to relocatable">;
 }
 
-// Clang specific/exclusive options for OpenACC.
-def openacc_macro_override
-    : Separate<["-"], "fexperimental-openacc-macro-override">,
-      Visibility<[ClangOption, CC1Option]>,
-      Group<f_Group>,
-      HelpText<"Overrides the _OPENACC macro value for experimental testing "
-               "during OpenACC support development">;
-def openacc_macro_override_EQ
-    : Joined<["-"], "fexperimental-openacc-macro-override=">,
-      Alias<openacc_macro_override>;
-
-// End Clang specific/exclusive options for OpenACC.
- def libomptarget_amdgpu_bc_path_EQ : Joined<["--"], "libomptarget-amdgpu-bc-path=">, Group, HelpText<"Path to libomptarget-amdgcn bitcode library">; def libomptarget_amdgcn_bc_path_EQ : Joined<["--"], "libomptarget-amdgcn-bc-path=">, Group, @@ -2169,10 +2163,10 @@ def fwasm_exceptions : Flag<["-"], "fwasm-exceptions">, Group, HelpText<"Use WebAssembly style exceptions">; def exception_model : Separate<["-"], "exception-model">, Visibility<[CC1Option]>, HelpText<"The exception model">, - Values<"dwarf,sjlj,seh,wasm">, - NormalizedValuesScope<"LangOptions::ExceptionHandlingKind">, - NormalizedValues<["DwarfCFI", "SjLj", "WinEH", "Wasm"]>, - MarshallingInfoEnum, "None">; + Values<"dwarf,sjlj,seh,wasm,none">, + NormalizedValuesScope<"CodeGenOptions::ExceptionHandlingKind">, + NormalizedValues<["DwarfCFI", "SjLj", "WinEH", "Wasm", "None"]>, + MarshallingInfoEnum, "None">; def exception_model_EQ : Joined<["-"], "exception-model=">, Visibility<[CC1Option]>, Alias; def fignore_exceptions : Flag<["-"], "fignore-exceptions">, Group, @@ -4249,7 +4243,12 @@ def ffinite_loops: Flag<["-"], "ffinite-loops">, Group, def fno_finite_loops: Flag<["-"], "fno-finite-loops">, Group, HelpText<"Do not assume that any loop is finite.">, Visibility<[ClangOption, CC1Option]>; - +def fthinlto_distributor_EQ : Joined<["-"], "fthinlto-distributor=">, + Group, + HelpText<"Path to the ThinLTO distributor process. If specified, " + "ThinLTO backend compilations will be distributed by LLD">, + MetaVarName<"">, + Visibility<[ClangOption, CLOption]>; def ftrigraphs : Flag<["-"], "ftrigraphs">, Group, HelpText<"Process trigraph sequences">, Visibility<[ClangOption, CC1Option]>; def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group, @@ -4504,6 +4503,9 @@ defm ptrauth_init_fini_address_discrimination : OptInCC1FFlag<"ptrauth-init-fini "Enable address discrimination of function pointers in init/fini arrays">; defm ptrauth_elf_got : OptInCC1FFlag<"ptrauth-elf-got", "Enable authentication of pointers from GOT (ELF only)">; defm aarch64_jump_table_hardening: OptInCC1FFlag<"aarch64-jump-table-hardening", "Use hardened lowering for jump-table dispatch">; +defm ptrauth_objc_isa : OptInCC1FFlag<"ptrauth-objc-isa", "Enable signing and authentication of Objective-C object's 'isa' field">; +defm ptrauth_objc_interface_sel : OptInCC1FFlag<"ptrauth-objc-interface-sel", "Enable signing and authentication of Objective-C object's 'SEL' fields">; +defm ptrauth_objc_class_ro : OptInCC1FFlag<"ptrauth-objc-class-ro", "Enable signing and authentication for ObjC class_ro pointers">; } def fenable_matrix : Flag<["-"], "fenable-matrix">, Group, @@ -5901,6 +5903,10 @@ def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">, MarshallingInfoFlag>; def pipe : Flag<["-", "--"], "pipe">, HelpText<"Use pipes between commands, when possible">; +// Facebook T92898286 +def post_link_optimize : Flag<["--"], "post-link-optimize">, + HelpText<"Apply post-link optimizations using BOLT">; +// End Facebook T92898286 def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">; def prebind : Flag<["-"], "prebind">; def preload : Flag<["-"], "preload">; diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index b4f2a87fe7e83..7677604484f52 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4385,8 +4385,18 @@ struct FormatStyle { /// #include "B/a.h" #include "a/b.h" /// \endcode bool IgnoreCase; + /// When sorting includes in each block, only take file 
extensions into + /// account if two includes compare equal otherwise. + /// \code + /// true: false: + /// # include "A.h" vs. # include "A-util.h" + /// # include "A.inc" # include "A.h" + /// # include "A-util.h" # include "A.inc" + /// \endcode + bool IgnoreExtension; bool operator==(const SortIncludesOptions &R) const { - return Enabled == R.Enabled && IgnoreCase == R.IgnoreCase; + return Enabled == R.Enabled && IgnoreCase == R.IgnoreCase && + IgnoreExtension == R.IgnoreExtension; } bool operator!=(const SortIncludesOptions &R) const { return !(*this == R); diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h index 1485192e8f1e3..1286fe4cca0f4 100644 --- a/clang/include/clang/Frontend/ASTUnit.h +++ b/clang/include/clang/Frontend/ASTUnit.h @@ -62,6 +62,7 @@ class ASTContext; class ASTDeserializationListener; class ASTMutationListener; class ASTReader; +class CodeGenOptions; class CompilerInstance; class CompilerInvocation; class Decl; @@ -107,6 +108,7 @@ class ASTUnit { private: std::unique_ptr LangOpts; + std::unique_ptr CodeGenOpts; // FIXME: The documentation on \c LoadFrom* member functions states that the // DiagnosticsEngine (and therefore DiagnosticOptions) must outlive the // returned ASTUnit. This is not the case. Enfore it by storing non-owning diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index 0ae490f0e8073..2408367e19833 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -720,6 +720,7 @@ class CompilerInstance : public ModuleLoader { DisableValidationForModuleKind DisableValidation, bool AllowPCHWithCompilerErrors, Preprocessor &PP, ModuleCache &ModCache, ASTContext &Context, const PCHContainerReader &PCHContainerRdr, + const CodeGenOptions &CodeGenOpts, ArrayRef> Extensions, ArrayRef> DependencyCollectors, void *DeserializationListener, bool OwnDeserializationListener, diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 567ad2d5934d4..e5680813e74de 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -44,8 +44,7 @@ namespace clang { class TypeLoc; class LangOptions; class IdentifierInfo; - class NamespaceAliasDecl; - class NamespaceDecl; + class NamespaceBaseDecl; class ObjCDeclSpec; class Sema; class Declarator; @@ -129,29 +128,15 @@ class CXXScopeSpec { /// \param Context The AST context in which this nested-name-specifier /// resides. /// - /// \param Namespace The namespace. + /// \param Namespace The namespace or the namespace alias. /// - /// \param NamespaceLoc The location of the namespace name. + /// \param NamespaceLoc The location of the namespace name or the namespace + /// alias. /// /// \param ColonColonLoc The location of the trailing '::'. - void Extend(ASTContext &Context, NamespaceDecl *Namespace, + void Extend(ASTContext &Context, NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc); - /// Extend the current nested-name-specifier by another - /// nested-name-specifier component of the form 'namespace-alias::'. - /// - /// \param Context The AST context in which this nested-name-specifier - /// resides. - /// - /// \param Alias The namespace alias. - /// - /// \param AliasLoc The location of the namespace alias - /// name. - /// - /// \param ColonColonLoc The location of the trailing '::'. 
- void Extend(ASTContext &Context, NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, SourceLocation ColonColonLoc); - /// Turn this (empty) nested-name-specifier into the global /// nested-name-specifier '::'. void MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc); @@ -1918,7 +1903,7 @@ class Declarator { /// parsed. This is pushed from the identifier out, which means that element /// #0 will be the most closely bound to the identifier, and /// DeclTypeInfo.back() will be the least closely bound. - SmallVector DeclTypeInfo; + SmallVector DeclTypeInfo; /// InvalidType - Set by Sema::GetTypeForDeclarator(). LLVM_PREFERRED_TYPE(bool) diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 18d52782db2c5..2edee7eaa19a1 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -678,7 +678,7 @@ class AttributePool { friend class AttributeFactory; friend class ParsedAttributes; AttributeFactory &Factory; - llvm::SmallVector Attrs; + llvm::SmallVector Attrs; void *allocate(size_t size) { return Factory.allocate(size); @@ -808,7 +808,7 @@ class AttributePool { class ParsedAttributesView { friend class AttributePool; - using VecTy = llvm::SmallVector; + using VecTy = llvm::SmallVector; using SizeType = decltype(std::declval().size()); public: diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 9d265f27b8e31..441047d64f48c 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -44,7 +44,7 @@ namespace serialization { /// Version 4 of AST files also requires that the version control branch and /// revision match exactly, since there is no backward compatibility of /// AST files at this time. -const unsigned VERSION_MAJOR = 34; +const unsigned VERSION_MAJOR = 35; /// AST file minor version number supported by this version of /// Clang. @@ -399,6 +399,9 @@ enum OptionsRecordTypes { /// Record code for the preprocessor options table. PREPROCESSOR_OPTIONS, + + /// Record code for the codegen options table. + CODEGEN_OPTIONS, }; /// Record codes for the unhashed control block. diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 3d96f8e65180d..7ce626cedae74 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -72,6 +72,7 @@ class ASTContext; class ASTDeserializationListener; class ASTReader; class ASTRecordReader; +class CodeGenOptions; class CXXTemporary; class Decl; class DeclarationName; @@ -137,6 +138,15 @@ class ASTReaderListener { return false; } + /// Receives the codegen options. + /// + /// \returns true to indicate the options are invalid or false otherwise. + virtual bool ReadCodeGenOptions(const CodeGenOptions &CGOpts, + StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) { + return false; + } + /// Receives the target options. 
/// /// \returns true to indicate the target options are invalid, or false @@ -281,6 +291,9 @@ class ChainedASTReaderListener : public ASTReaderListener { bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; + bool ReadCodeGenOptions(const CodeGenOptions &CGOpts, + StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) override; bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; @@ -322,6 +335,9 @@ class PCHValidator : public ASTReaderListener { bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; + bool ReadCodeGenOptions(const CodeGenOptions &CGOpts, + StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) override; bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; @@ -493,6 +509,9 @@ class ASTReader /// The AST consumer. ASTConsumer *Consumer = nullptr; + /// The codegen options. + const CodeGenOptions &CodeGenOpts; + /// The module manager which manages modules and their dependencies ModuleManager ModuleMgr; @@ -1580,6 +1599,10 @@ class ASTReader StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences); + static bool ParseCodeGenOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, + ASTReaderListener &Listener, + bool AllowCompatibleDifferences); static bool ParseTargetOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, @@ -1771,6 +1794,7 @@ class ASTReader /// deserializing. ASTReader(Preprocessor &PP, ModuleCache &ModCache, ASTContext *Context, const PCHContainerReader &PCHContainerRdr, + const CodeGenOptions &CodeGenOpts, ArrayRef> Extensions, StringRef isysroot = "", DisableValidationForModuleKind DisableValidationKind = @@ -1789,6 +1813,7 @@ class ASTReader SourceManager &getSourceManager() const { return SourceMgr; } FileManager &getFileManager() const { return FileMgr; } DiagnosticsEngine &getDiags() const { return Diags; } + const CodeGenOptions &getCodeGenOpts() const { return CodeGenOpts; } /// Flags that indicate what kind of AST loading failures the client /// of the AST reader can directly handle. @@ -1990,14 +2015,12 @@ class ASTReader /// Determine whether the given AST file is acceptable to load into a /// translation unit with the given language and target options. 
- static bool isAcceptableASTFile(StringRef Filename, FileManager &FileMgr, - const ModuleCache &ModCache, - const PCHContainerReader &PCHContainerRdr, - const LangOptions &LangOpts, - const TargetOptions &TargetOpts, - const PreprocessorOptions &PPOpts, - StringRef ExistingModuleCachePath, - bool RequireStrictOptionMatches = false); + static bool isAcceptableASTFile( + StringRef Filename, FileManager &FileMgr, const ModuleCache &ModCache, + const PCHContainerReader &PCHContainerRdr, const LangOptions &LangOpts, + const CodeGenOptions &CGOpts, const TargetOptions &TargetOpts, + const PreprocessorOptions &PPOpts, StringRef ExistingModuleCachePath, + bool RequireStrictOptionMatches = false); /// Returns the suggested contents of the predefines buffer, /// which contains a (typically-empty) subset of the predefines diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index c86019f01d9f3..edf5bbaddf1aa 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -49,6 +49,7 @@ namespace clang { class ASTContext; class ASTReader; class Attr; +class CodeGenOptions; class CXXRecordDecl; class FileEntry; class FPOptionsOverride; @@ -124,6 +125,8 @@ class ASTWriter : public ASTDeserializationListener, /// The PCM manager which manages memory buffers for pcm files. ModuleCache &ModCache; + const CodeGenOptions &CodeGenOpts; + /// The preprocessor we're writing. Preprocessor *PP = nullptr; @@ -686,13 +689,14 @@ class ASTWriter : public ASTDeserializationListener, /// Create a new precompiled header writer that outputs to /// the given bitstream. ASTWriter(llvm::BitstreamWriter &Stream, SmallVectorImpl &Buffer, - ModuleCache &ModCache, + ModuleCache &ModCache, const CodeGenOptions &CodeGenOpts, ArrayRef> Extensions, bool IncludeTimestamps = true, bool BuildingImplicitModule = false, bool GeneratingReducedBMI = false); ~ASTWriter() override; const LangOptions &getLangOpts() const; + const CodeGenOptions &getCodeGenOpts() const { return CodeGenOpts; } /// Get a timestamp for output into the AST file. 
The actual timestamp /// of the specified file may be ignored if we have been instructed to not @@ -999,6 +1003,7 @@ class PCHGenerator : public SemaConsumer { public: PCHGenerator(Preprocessor &PP, ModuleCache &ModCache, StringRef OutputFile, StringRef isysroot, std::shared_ptr Buffer, + const CodeGenOptions &CodeGenOpts, ArrayRef> Extensions, bool AllowASTWithErrors = false, bool IncludeTimestamps = true, bool BuildingImplicitModule = false, @@ -1021,13 +1026,14 @@ class CXX20ModulesGenerator : public PCHGenerator { virtual Module *getEmittingModule(ASTContext &Ctx) override; CXX20ModulesGenerator(Preprocessor &PP, ModuleCache &ModCache, - StringRef OutputFile, bool GeneratingReducedBMI, - bool AllowASTWithErrors); + StringRef OutputFile, const CodeGenOptions &CodeGenOpts, + bool GeneratingReducedBMI, bool AllowASTWithErrors); public: CXX20ModulesGenerator(Preprocessor &PP, ModuleCache &ModCache, - StringRef OutputFile, bool AllowASTWithErrors = false) - : CXX20ModulesGenerator(PP, ModCache, OutputFile, + StringRef OutputFile, const CodeGenOptions &CodeGenOpts, + bool AllowASTWithErrors = false) + : CXX20ModulesGenerator(PP, ModCache, OutputFile, CodeGenOpts, /*GeneratingReducedBMI=*/false, AllowASTWithErrors) {} @@ -1039,8 +1045,9 @@ class ReducedBMIGenerator : public CXX20ModulesGenerator { public: ReducedBMIGenerator(Preprocessor &PP, ModuleCache &ModCache, - StringRef OutputFile, bool AllowASTWithErrors = false) - : CXX20ModulesGenerator(PP, ModCache, OutputFile, + StringRef OutputFile, const CodeGenOptions &CodeGenOpts, + bool AllowASTWithErrors = false) + : CXX20ModulesGenerator(PP, ModCache, OutputFile, CodeGenOpts, /*GeneratingReducedBMI=*/true, AllowASTWithErrors) {} }; diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h index 6370586e218ef..fbb34340a5c67 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h @@ -499,9 +499,6 @@ class ExprEngine { void VisitGuardedExpr(const Expr *Ex, const Expr *L, const Expr *R, ExplodedNode *Pred, ExplodedNodeSet &Dst); - void VisitInitListExpr(const InitListExpr *E, ExplodedNode *Pred, - ExplodedNodeSet &Dst); - /// VisitAttributedStmt - Transfer function logic for AttributedStmt. void VisitAttributedStmt(const AttributedStmt *A, ExplodedNode *Pred, ExplodedNodeSet &Dst); @@ -591,6 +588,10 @@ class ExprEngine { ExplodedNode *Pred, ExplodedNodeSet &Dst); + void ConstructInitList(const Expr *Source, ArrayRef Args, + bool IsTransparent, ExplodedNode *Pred, + ExplodedNodeSet &Dst); + /// evalEagerlyAssumeBifurcation - Given the nodes in 'Src', eagerly assume /// concrete boolean values for 'Ex', storing the resulting nodes in 'Dst'. void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, diff --git a/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h b/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h index 015dbba26f688..271232e66626e 100644 --- a/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h +++ b/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h @@ -115,7 +115,8 @@ class RecursiveSymbolVisitor // The base visitor will visit NNSL prefixes, so we should only look at // the current NNS. 
if (NNS) { - const NamespaceDecl *ND = NNS.getNestedNameSpecifier()->getAsNamespace(); + const auto *ND = dyn_cast_if_present( + NNS.getNestedNameSpecifier()->getAsNamespace()); if (!visit(ND, NNS.getLocalBeginLoc(), NNS.getLocalEndLoc())) return false; } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 679812adcdf12..232a4b6557b92 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -7387,21 +7387,9 @@ bool ASTContext::isSameDefaultTemplateArgument(const NamedDecl *X, return hasSameTemplateName(TAX.getAsTemplate(), TAY.getAsTemplate()); } -static NamespaceDecl *getNamespace(const NestedNameSpecifier *X) { - if (auto *NS = X->getAsNamespace()) - return NS; - if (auto *NAS = X->getAsNamespaceAlias()) - return NAS->getNamespace(); - return nullptr; -} - static bool isSameQualifier(const NestedNameSpecifier *X, const NestedNameSpecifier *Y) { - if (auto *NSX = getNamespace(X)) { - auto *NSY = getNamespace(Y); - if (!NSY || NSX->getCanonicalDecl() != NSY->getCanonicalDecl()) - return false; - } else if (X->getKind() != Y->getKind()) + if (X->getKind() != Y->getKind()) return false; // FIXME: For namespaces and types, we're permitted to check that the entity @@ -7412,8 +7400,8 @@ static bool isSameQualifier(const NestedNameSpecifier *X, return false; break; case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: - // We've already checked that we named the same namespace. + if (!declaresSameEntity(X->getAsNamespace(), Y->getAsNamespace())) + return false; break; case NestedNameSpecifier::TypeSpec: if (X->getAsType()->getCanonicalTypeInternal() != @@ -7836,17 +7824,10 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { NNS->getAsIdentifier()); case NestedNameSpecifier::Namespace: - // A namespace is canonical; build a nested-name-specifier with - // this namespace and no prefix. - return NestedNameSpecifier::Create(*this, nullptr, - NNS->getAsNamespace()->getFirstDecl()); - - case NestedNameSpecifier::NamespaceAlias: // A namespace is canonical; build a nested-name-specifier with // this namespace and no prefix. return NestedNameSpecifier::Create( - *this, nullptr, - NNS->getAsNamespaceAlias()->getNamespace()->getFirstDecl()); + *this, nullptr, NNS->getAsNamespace()->getNamespace()->getFirstDecl()); // The difference between TypeSpec and TypeSpecWithTemplate is that the // latter will have the 'template' keyword when printed. 
@@ -9783,6 +9764,17 @@ ObjCInterfaceDecl *ASTContext::getObjCProtocolDecl() const { return ObjCProtocolClassDecl; } +PointerAuthQualifier ASTContext::getObjCMemberSelTypePtrAuth() { + if (!getLangOpts().PointerAuthObjcInterfaceSel) + return PointerAuthQualifier(); + return PointerAuthQualifier::Create( + getLangOpts().PointerAuthObjcInterfaceSelKey, + /*isAddressDiscriminated=*/true, SelPointerConstantDiscriminator, + PointerAuthenticationMode::SignAndAuth, + /*isIsaPointer=*/false, + /*authenticatesNullValues=*/false); +} + //===----------------------------------------------------------------------===// // __builtin_va_list Construction Functions //===----------------------------------------------------------------------===// @@ -9987,14 +9979,6 @@ CreateX86_64ABIBuiltinVaListDecl(const ASTContext *Context) { return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list"); } -static TypedefDecl *CreatePNaClABIBuiltinVaListDecl(const ASTContext *Context) { - // typedef int __builtin_va_list[4]; - llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 4); - QualType IntArrayType = Context->getConstantArrayType( - Context->IntTy, Size, nullptr, ArraySizeModifier::Normal, 0); - return Context->buildImplicitTypedef(IntArrayType, "__builtin_va_list"); -} - static TypedefDecl * CreateAAPCSABIBuiltinVaListDecl(const ASTContext *Context) { // struct __va_list @@ -10192,8 +10176,6 @@ static TypedefDecl *CreateVaListDecl(const ASTContext *Context, return CreatePowerABIBuiltinVaListDecl(Context); case TargetInfo::X86_64ABIBuiltinVaList: return CreateX86_64ABIBuiltinVaListDecl(Context); - case TargetInfo::PNaClABIBuiltinVaList: - return CreatePNaClABIBuiltinVaListDecl(Context); case TargetInfo::AAPCSABIBuiltinVaList: return CreateAAPCSABIBuiltinVaListDecl(Context); case TargetInfo::SystemZBuiltinVaList: @@ -12677,10 +12659,9 @@ QualType ASTContext::GetBuiltinType(unsigned Id, bool Variadic = (TypeStr[0] == '.'); - FunctionType::ExtInfo EI(getDefaultCallingConvention( - Variadic, /*IsCXXMethod=*/false, /*IsBuiltin=*/true)); - if (BuiltinInfo.isNoReturn(Id)) EI = EI.withNoReturn(true); - + FunctionType::ExtInfo EI(Target->getDefaultCallingConv()); + if (BuiltinInfo.isNoReturn(Id)) + EI = EI.withNoReturn(true); // We really shouldn't be making a no-proto type here. if (ArgTypes.empty() && Variadic && !getLangOpts().requiresStrictPrototypes()) @@ -13064,43 +13045,38 @@ void ASTContext::forEachMultiversionedFunctionVersion( } CallingConv ASTContext::getDefaultCallingConvention(bool IsVariadic, - bool IsCXXMethod, - bool IsBuiltin) const { + bool IsCXXMethod) const { // Pass through to the C++ ABI object if (IsCXXMethod) return ABI->getDefaultMethodCallConv(IsVariadic); - // Builtins ignore user-specified default calling convention and remain the - // Target's default calling convention. - if (!IsBuiltin) { - switch (LangOpts.getDefaultCallingConv()) { - case LangOptions::DCC_None: - break; - case LangOptions::DCC_CDecl: - return CC_C; - case LangOptions::DCC_FastCall: - if (getTargetInfo().hasFeature("sse2") && !IsVariadic) - return CC_X86FastCall; - break; - case LangOptions::DCC_StdCall: - if (!IsVariadic) - return CC_X86StdCall; - break; - case LangOptions::DCC_VectorCall: - // __vectorcall cannot be applied to variadic functions. - if (!IsVariadic) - return CC_X86VectorCall; - break; - case LangOptions::DCC_RegCall: - // __regcall cannot be applied to variadic functions. 
- if (!IsVariadic) - return CC_X86RegCall; - break; - case LangOptions::DCC_RtdCall: - if (!IsVariadic) - return CC_M68kRTD; - break; - } + switch (LangOpts.getDefaultCallingConv()) { + case LangOptions::DCC_None: + break; + case LangOptions::DCC_CDecl: + return CC_C; + case LangOptions::DCC_FastCall: + if (getTargetInfo().hasFeature("sse2") && !IsVariadic) + return CC_X86FastCall; + break; + case LangOptions::DCC_StdCall: + if (!IsVariadic) + return CC_X86StdCall; + break; + case LangOptions::DCC_VectorCall: + // __vectorcall cannot be applied to variadic functions. + if (!IsVariadic) + return CC_X86VectorCall; + break; + case LangOptions::DCC_RegCall: + // __regcall cannot be applied to variadic functions. + if (!IsVariadic) + return CC_X86RegCall; + break; + case LangOptions::DCC_RtdCall: + if (!IsVariadic) + return CC_M68kRTD; + break; } return Target->getDefaultCallingConv(); } @@ -13703,26 +13679,27 @@ static NestedNameSpecifier *getCommonNNS(ASTContext &Ctx, R = NestedNameSpecifier::Create(Ctx, P, II); break; } - case NestedNameSpecifier::SpecifierKind::Namespace: - case NestedNameSpecifier::SpecifierKind::NamespaceAlias: { - assert(K2 == NestedNameSpecifier::SpecifierKind::Namespace || - K2 == NestedNameSpecifier::SpecifierKind::NamespaceAlias); + case NestedNameSpecifier::SpecifierKind::Namespace: { + assert(K2 == NestedNameSpecifier::SpecifierKind::Namespace); // The prefixes for namespaces are not significant, its declaration // identifies it uniquely. NestedNameSpecifier *P = ::getCommonNNS(Ctx, NNS1->getPrefix(), NNS2->getPrefix(), /*IsSame=*/false); - NamespaceAliasDecl *A1 = NNS1->getAsNamespaceAlias(), - *A2 = NNS2->getAsNamespaceAlias(); - // Are they the same namespace alias? - if (declaresSameEntity(A1, A2)) { - R = NestedNameSpecifier::Create(Ctx, P, ::getCommonDeclChecked(A1, A2)); + NamespaceBaseDecl *Namespace1 = NNS1->getAsNamespace(), + *Namespace2 = NNS2->getAsNamespace(); + auto Kind = Namespace1->getKind(); + if (Kind != Namespace2->getKind() || + (Kind == Decl::NamespaceAlias && + !declaresSameEntity(Namespace1, Namespace2))) { + R = NestedNameSpecifier::Create( + Ctx, P, + ::getCommonDeclChecked(Namespace1->getNamespace(), + Namespace2->getNamespace())); break; } - // Otherwise, look at the namespaces only. - NamespaceDecl *N1 = A1 ? A1->getNamespace() : NNS1->getAsNamespace(), - *N2 = A2 ? 
A2->getNamespace() : NNS2->getAsNamespace(); - R = NestedNameSpecifier::Create(Ctx, P, ::getCommonDeclChecked(N1, N2)); + R = NestedNameSpecifier::Create( + Ctx, P, ::getCommonDeclChecked(Namespace1, Namespace2)); break; } case NestedNameSpecifier::SpecifierKind::TypeSpec: { diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 4d3bd985739fb..b5f6c5a8c6abe 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -10063,17 +10063,10 @@ ASTImporter::Import(NestedNameSpecifier *FromNNS) { case NestedNameSpecifier::Namespace: if (ExpectedDecl NSOrErr = Import(FromNNS->getAsNamespace())) { return NestedNameSpecifier::Create(ToContext, Prefix, - cast(*NSOrErr)); + cast(*NSOrErr)); } else return NSOrErr.takeError(); - case NestedNameSpecifier::NamespaceAlias: - if (ExpectedDecl NSADOrErr = Import(FromNNS->getAsNamespaceAlias())) - return NestedNameSpecifier::Create(ToContext, Prefix, - cast(*NSADOrErr)); - else - return NSADOrErr.takeError(); - case NestedNameSpecifier::Global: return NestedNameSpecifier::GlobalSpecifier(ToContext); @@ -10139,11 +10132,6 @@ ASTImporter::Import(NestedNameSpecifierLoc FromNNS) { ToLocalEndLoc); break; - case NestedNameSpecifier::NamespaceAlias: - Builder.Extend(getToContext(), Spec->getAsNamespaceAlias(), - ToLocalBeginLoc, ToLocalEndLoc); - break; - case NestedNameSpecifier::TypeSpec: { SourceLocation ToTLoc; if (Error Err = importInto(ToTLoc, NNS.getTypeLoc().getBeginLoc())) diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 3aa6b37844103..289c6d7737de7 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -598,9 +598,6 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, case NestedNameSpecifier::Namespace: return IsStructurallyEquivalent(Context, NNS1->getAsNamespace(), NNS2->getAsNamespace()); - case NestedNameSpecifier::NamespaceAlias: - return IsStructurallyEquivalent(Context, NNS1->getAsNamespaceAlias(), - NNS2->getAsNamespaceAlias()); case NestedNameSpecifier::TypeSpec: return IsStructurallyEquivalent(Context, QualType(NNS1->getAsType(), 0), QualType(NNS2->getAsType(), 0)); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index afa3b7ea7de7e..ea473730350b6 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -25,6 +25,34 @@ using APSInt = llvm::APSInt; namespace clang { namespace interp { +static bool refersToUnion(const Expr *E) { + for (;;) { + if (const auto *ME = dyn_cast(E)) { + if (const auto *FD = dyn_cast(ME->getMemberDecl()); + FD && FD->getParent()->isUnion()) + return true; + E = ME->getBase(); + continue; + } + + if (const auto *ASE = dyn_cast(E)) { + E = ASE->getBase()->IgnoreImplicit(); + continue; + } + + if (const auto *ICE = dyn_cast(E); + ICE && (ICE->getCastKind() == CK_NoOp || + ICE->getCastKind() == CK_DerivedToBase || + ICE->getCastKind() == CK_UncheckedDerivedToBase)) { + E = ICE->getSubExpr(); + continue; + } + + break; + } + return false; +} + static std::optional getBoolValue(const Expr *E) { if (const auto *CE = dyn_cast_if_present(E); CE && CE->hasAPValueResult() && @@ -880,22 +908,11 @@ bool Compiler::VisitBinaryOperator(const BinaryOperator *BO) { return this->VisitPointerArithBinOp(BO); } - // Assignments require us to evalute the RHS first. 
- if (BO->getOpcode() == BO_Assign) { - - if (!visit(RHS) || !visit(LHS)) - return false; - - // We don't support assignments in C. - if (!Ctx.getLangOpts().CPlusPlus && !this->emitInvalid(BO)) - return false; + if (BO->getOpcode() == BO_Assign) + return this->visitAssignment(LHS, RHS, BO); - if (!this->emitFlip(*LT, *RT, BO)) - return false; - } else { - if (!visit(LHS) || !visit(RHS)) - return false; - } + if (!visit(LHS) || !visit(RHS)) + return false; // For languages such as C, cast the result of one // of our comparision opcodes to T (which is usually int). @@ -946,22 +963,6 @@ bool Compiler::VisitBinaryOperator(const BinaryOperator *BO) { if (BO->getType()->isFloatingType()) return Discard(this->emitDivf(getFPOptions(BO), BO)); return Discard(this->emitDiv(*T, BO)); - case BO_Assign: - if (DiscardResult) - return LHS->refersToBitField() ? this->emitStoreBitFieldPop(*T, BO) - : this->emitStorePop(*T, BO); - if (LHS->refersToBitField()) { - if (!this->emitStoreBitField(*T, BO)) - return false; - } else { - if (!this->emitStore(*T, BO)) - return false; - } - // Assignments aren't necessarily lvalues in C. - // Load from them in that case. - if (!BO->isLValue()) - return this->emitLoadPop(*T, BO); - return true; case BO_And: return Discard(this->emitBitAnd(*T, BO)); case BO_Or: @@ -1790,19 +1791,26 @@ bool Compiler::visitInitList(ArrayRef Inits, return this->delegate(Inits[0]); auto initPrimitiveField = [=](const Record::Field *FieldToInit, - const Expr *Init, PrimType T) -> bool { + const Expr *Init, PrimType T, + bool Activate = false) -> bool { InitStackScope ISS(this, isa(Init)); InitLinkScope ILS(this, InitLink::Field(FieldToInit->Offset)); if (!this->visit(Init)) return false; - if (FieldToInit->isBitField()) + bool BitField = FieldToInit->isBitField(); + if (BitField && Activate) + return this->emitInitBitFieldActivate(T, FieldToInit, E); + if (BitField) return this->emitInitBitField(T, FieldToInit, E); + if (Activate) + return this->emitInitFieldActivate(T, FieldToInit->Offset, E); return this->emitInitField(T, FieldToInit->Offset, E); }; auto initCompositeField = [=](const Record::Field *FieldToInit, - const Expr *Init) -> bool { + const Expr *Init, + bool Activate = false) -> bool { InitStackScope ISS(this, isa(Init)); InitLinkScope ILS(this, InitLink::Field(FieldToInit->Offset)); @@ -1810,6 +1818,10 @@ bool Compiler::visitInitList(ArrayRef Inits, // on the stack and recurse into visitInitializer(). 
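The Activate flag threaded through initPrimitiveField and initCompositeField models how an initializer list that constructs a union member must make that member active. A sketch of the source-level behavior (illustrative, not from this patch):

  union P { int i; float f; };
  constexpr P p{.f = 1.5f};    // the designated initializer activates .f
  static_assert(p.f == 1.5f);  // OK: .f is the active member
  // static_assert(p.i == 0);  // ill-formed: .i is not the active member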
if (!this->emitGetPtrField(FieldToInit->Offset, Init)) return false; + + if (Activate && !this->emitActivate(E)) + return false; + if (!this->visitInitializer(Init)) return false; return this->emitPopPtr(E); @@ -1829,10 +1841,10 @@ bool Compiler::visitInitList(ArrayRef Inits, const Record::Field *FieldToInit = R->getField(FToInit); if (std::optional T = classify(Init)) { - if (!initPrimitiveField(FieldToInit, Init, *T)) + if (!initPrimitiveField(FieldToInit, Init, *T, /*Activate=*/true)) return false; } else { - if (!initCompositeField(FieldToInit, Init)) + if (!initCompositeField(FieldToInit, Init, /*Activate=*/true)) return false; } } @@ -2023,7 +2035,8 @@ bool Compiler::visitArrayElemInit(unsigned ElemIndex, const Expr *Init, template bool Compiler::visitCallArgs(ArrayRef Args, - const FunctionDecl *FuncDecl) { + const FunctionDecl *FuncDecl, + bool Activate) { assert(VarScope->getKind() == ScopeKind::Call); llvm::BitVector NonNullArgs = collectNonNullArgs(FuncDecl, Args); @@ -2046,6 +2059,11 @@ bool Compiler::visitCallArgs(ArrayRef Args, return false; } + if (ArgIndex == 1 && Activate) { + if (!this->emitActivate(Arg)) + return false; + } + if (FuncDecl && NonNullArgs[ArgIndex]) { PrimType ArgT = classify(Arg).value_or(PT_Ptr); if (ArgT == PT_Ptr) { @@ -4227,10 +4245,13 @@ bool Compiler::visitZeroRecordInitializer(const Record *R, PrimType T = classifyPrim(D->getType()); if (!this->visitZeroInitializer(T, QT, E)) return false; + if (R->isUnion()) { + if (!this->emitInitFieldActivate(T, Field.Offset, E)) + return false; + break; + } if (!this->emitInitField(T, Field.Offset, E)) return false; - if (R->isUnion()) - break; continue; } @@ -4256,13 +4277,15 @@ bool Compiler::visitZeroRecordInitializer(const Record *R, } else return false; - if (!this->emitFinishInitPop(E)) - return false; - // C++11 [dcl.init]p5: If T is a (possibly cv-qualified) union type, the // object's first non-static named data member is zero-initialized - if (R->isUnion()) + if (R->isUnion()) { + if (!this->emitFinishInitActivatePop(E)) + return false; break; + } + if (!this->emitFinishInitPop(E)) + return false; } for (const Record::Base &B : R->bases()) { @@ -4325,6 +4348,59 @@ bool Compiler::visitZeroArrayInitializer(QualType T, const Expr *E) { return false; } +template +bool Compiler::visitAssignment(const Expr *LHS, const Expr *RHS, + const Expr *E) { + if (!classify(E->getType())) + return false; + + if (!this->visit(RHS)) + return false; + if (!this->visit(LHS)) + return false; + + // We don't support assignments in C. + if (!Ctx.getLangOpts().CPlusPlus && !this->emitInvalid(E)) + return false; + + PrimType RHT = classifyPrim(RHS); + bool Activates = refersToUnion(LHS); + bool BitField = LHS->refersToBitField(); + + if (!this->emitFlip(PT_Ptr, RHT, E)) + return false; + + if (DiscardResult) { + if (BitField && Activates) + return this->emitStoreBitFieldActivatePop(RHT, E); + if (BitField) + return this->emitStoreBitFieldPop(RHT, E); + if (Activates) + return this->emitStoreActivatePop(RHT, E); + // Otherwise, regular non-activating store. + return this->emitStorePop(RHT, E); + } + + auto maybeLoad = [&](bool Result) -> bool { + if (!Result) + return false; + // Assignments aren't necessarily lvalues in C. + // Load from them in that case. 
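The maybeLoad helper exists because of a language difference the removed inline code also handled: in C++ an assignment expression is an lvalue, while in C it yields the stored value, so the interpreter may have to load from the result. Illustrative sketch (well-formed in C++ only):

  int a = 0, b = 1;
  void chain() {
    (a = b) = 2;  // OK in C++: the assignment yields an lvalue naming 'a';
  }               // a constraint violation in C, where (a = b) is an rvalue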
+ if (!E->isLValue()) + return this->emitLoadPop(RHT, E); + return true; + }; + + if (BitField && Activates) + return maybeLoad(this->emitStoreBitFieldActivate(RHT, E)); + if (BitField) + return maybeLoad(this->emitStoreBitField(RHT, E)); + if (Activates) + return maybeLoad(this->emitStoreActivate(RHT, E)); + // Otherwise, regular non-activating store. + return maybeLoad(this->emitStore(RHT, E)); +} + template template bool Compiler::emitConst(T Value, PrimType Ty, const Expr *E) { @@ -5067,7 +5143,7 @@ bool Compiler::VisitCallExpr(const CallExpr *E) { return false; } - if (!this->visitCallArgs(Args, FuncDecl)) + if (!this->visitCallArgs(Args, FuncDecl, IsAssignmentOperatorCall)) return false; // Undo the argument reversal we did earlier. @@ -5851,7 +5927,8 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { assert(!ReturnType); auto emitFieldInitializer = [&](const Record::Field *F, unsigned FieldOffset, - const Expr *InitExpr) -> bool { + const Expr *InitExpr, + bool Activate = false) -> bool { // We don't know what to do with these, so just return false. if (InitExpr->getType().isNull()) return false; @@ -5860,8 +5937,13 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { if (!this->visit(InitExpr)) return false; - if (F->isBitField()) + bool BitField = F->isBitField(); + if (BitField && Activate) + return this->emitInitThisBitFieldActivate(*T, F, FieldOffset, InitExpr); + if (BitField) return this->emitInitThisBitField(*T, F, FieldOffset, InitExpr); + if (Activate) + return this->emitInitThisFieldActivate(*T, FieldOffset, InitExpr); return this->emitInitThisField(*T, FieldOffset, InitExpr); } // Non-primitive case. Get a pointer to the field-to-initialize @@ -5870,6 +5952,9 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { if (!this->emitGetPtrThisField(FieldOffset, InitExpr)) return false; + if (Activate && !this->emitActivate(InitExpr)) + return false; + if (!this->visitInitializer(InitExpr)) return false; @@ -5880,8 +5965,9 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { const Record *R = this->getRecord(RD); if (!R) return false; + bool IsUnion = R->isUnion(); - if (R->isUnion() && Ctor->isCopyOrMoveConstructor()) { + if (IsUnion && Ctor->isCopyOrMoveConstructor()) { if (R->getNumFields() == 0) return this->emitRetVoid(Ctor); // union copy and move ctors are special. 
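The special-casing of union copy and move constructors reflects the core-language rule that copying a union copies its object representation together with which member is active. Sketch (illustrative, not from this patch):

  union V { int i; char c; };
  constexpr int copy_keeps_active() {
    V a{};     // .i is active
    a.i = 7;
    V b = a;   // the trivial copy leaves .i active in b as well
    return b.i;
  }
  static_assert(copy_keeps_active() == 7);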
@@ -5908,7 +5994,7 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { if (const FieldDecl *Member = Init->getMember()) { const Record::Field *F = R->getField(Member); - if (!emitFieldInitializer(F, F->Offset, InitExpr)) + if (!emitFieldInitializer(F, F->Offset, InitExpr, IsUnion)) return false; } else if (const Type *Base = Init->getBaseClass()) { const auto *BaseDecl = Base->getAsCXXRecordDecl(); @@ -5928,11 +6014,15 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { return false; } + if (IsUnion && !this->emitActivate(InitExpr)) + return false; + if (!this->visitInitializer(InitExpr)) return false; if (!this->emitFinishInitPop(InitExpr)) return false; } else if (const IndirectFieldDecl *IFD = Init->getIndirectMember()) { + assert(IFD->getChainingSize() >= 2); unsigned NestedFieldOffset = 0; @@ -5944,12 +6034,14 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { NestedField = FieldRecord->getField(FD); assert(NestedField); + IsUnion = IsUnion || FieldRecord->isUnion(); NestedFieldOffset += NestedField->Offset; } assert(NestedField); - if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr)) + if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr, + IsUnion)) return false; // Mark all chain links as initialized. diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 05ba7460b343b..debee6726853b 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -307,7 +307,8 @@ class Compiler : public ConstStmtVisitor, bool>, const Expr *E); bool visitArrayElemInit(unsigned ElemIndex, const Expr *Init, std::optional InitT); - bool visitCallArgs(ArrayRef Args, const FunctionDecl *FuncDecl); + bool visitCallArgs(ArrayRef Args, const FunctionDecl *FuncDecl, + bool Activate); /// Creates a local primitive value. unsigned allocateLocalPrimitive(DeclTy &&Decl, PrimType Ty, bool IsConst, @@ -342,6 +343,7 @@ class Compiler : public ConstStmtVisitor, bool>, bool visitZeroInitializer(PrimType T, QualType QT, const Expr *E); bool visitZeroRecordInitializer(const Record *R, const Expr *E); bool visitZeroArrayInitializer(QualType T, const Expr *E); + bool visitAssignment(const Expr *LHS, const Expr *RHS, const Expr *E); /// Emits an APSInt constant. bool emitConst(const llvm::APSInt &Value, PrimType Ty, const Expr *E); diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 457de2bed37d6..e8b519478c026 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -326,7 +326,6 @@ bool CheckActive(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return true; assert(Ptr.inUnion()); - assert(Ptr.isField() && Ptr.getField()); Pointer U = Ptr.getBase(); Pointer C = Ptr; @@ -575,7 +574,7 @@ bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { // The This pointer is writable in constructors and destructors, // even if isConst() returns true. 
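The IndirectFieldDecl chain walk, with IsUnion accumulated along the path, covers mem-initializers that name a member of an anonymous union: every union on the chain has to be activated. The source pattern being handled looks like this (illustrative sketch):

  struct W {
    union { int n; float f; };  // reached through an IndirectFieldDecl chain
    constexpr W() : f(1.5f) {}  // initializing 'f' activates the anonymous union
  };
  static_assert(W().f == 1.5f);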
- if (llvm::find(S.InitializingBlocks, Ptr.block())) + if (llvm::is_contained(S.InitializingBlocks, Ptr.block())) return true; const QualType Ty = Ptr.getType(); @@ -805,6 +804,8 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return false; if (!CheckRange(S, OpPC, Ptr, AK_Assign)) return false; + if (!CheckActive(S, OpPC, Ptr, AK_Assign)) + return false; if (!CheckGlobal(S, OpPC, Ptr)) return false; if (!CheckConst(S, OpPC, Ptr)) @@ -814,7 +815,7 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return true; } -bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { +static bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckLive(S, OpPC, Ptr, AK_MemberCall)) return false; if (!Ptr.isDummy()) { @@ -936,7 +937,7 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { return false; } -bool CheckCallDepth(InterpState &S, CodePtr OpPC) { +static bool CheckCallDepth(InterpState &S, CodePtr OpPC) { if ((S.Current->getDepth() + 1) > S.getLangOpts().ConstexprCallDepth) { S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_depth_limit_exceeded) @@ -1091,8 +1092,8 @@ bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; } -bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, - const CallExpr *CE, unsigned ArgSize) { +static bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, + const CallExpr *CE, unsigned ArgSize) { auto Args = ArrayRef(CE->getArgs(), CE->getNumArgs()); auto NonNullArgs = collectNonNullArgs(F->getDecl(), Args); unsigned Offset = 0; @@ -1500,7 +1501,6 @@ bool Call(InterpState &S, CodePtr OpPC, const Function *Func, if (!CheckInvoke(S, OpPC, ThisPtr)) return cleanup(); if (!Func->isConstructor() && !Func->isDestructor() && - !Func->isCopyOrMoveOperator() && !CheckActive(S, OpPC, ThisPtr, AK_MemberCall)) return false; } @@ -1773,6 +1773,9 @@ bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E, std::optional ArraySize) { const Pointer &Ptr = S.Stk.peek(); + if (Ptr.inUnion() && Ptr.getBase().getRecord()->isUnion()) + Ptr.activate(); + // Similar to CheckStore(), but with the additional CheckTemporary() call and // the AccessKinds are different. if (!CheckTemporary(S, OpPC, Ptr, AK_Construct)) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 9d17f96c97c85..ce0ebdd8321b7 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -98,26 +98,12 @@ bool CheckGlobalInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr); /// Checks if a value can be stored in a block. bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -/// Checks if a method can be invoked on an object. -bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr); - /// Checks if a value can be initialized. bool CheckInit(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -/// Checks if a method can be called. -bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F); - -/// Checks if calling the currently active function would exceed -/// the allowed call depth. -bool CheckCallDepth(InterpState &S, CodePtr OpPC); - /// Checks the 'this' pointer. bool CheckThis(InterpState &S, CodePtr OpPC, const Pointer &This); -/// Checks if all the arguments annotated as 'nonnull' are in fact not null. 
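With CheckActive now part of CheckStore, stores that reach a union member whose lifetime has ended are rejected, mirroring the long-standing rule for reads. A sketch of the kind of code this diagnoses (illustrative, not from this patch):

  constexpr int stale_store() {
    union { int a; int b; } u{};  // .a is active
    int *p = &u.a;
    u.b = 1;   // activates .b; .a's lifetime ends
    *p = 2;    // store to the now-inactive .a: not a constant expression
    return u.b;
  }
  // static_assert(stale_store() == 1); // would be rejected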
-bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, - const CallExpr *CE, unsigned ArgSize); - /// Checks if dynamic memory allocation is available in the current /// language mode. bool CheckDynamicMemoryAllocation(InterpState &S, CodePtr OpPC); @@ -1592,6 +1578,21 @@ bool InitThisField(InterpState &S, CodePtr OpPC, uint32_t I) { if (!CheckThis(S, OpPC, This)) return false; const Pointer &Field = This.atField(I); + assert(Field.canBeInitialized()); + Field.deref() = S.Stk.pop(); + Field.initialize(); + return true; +} + +template ::T> +bool InitThisFieldActivate(InterpState &S, CodePtr OpPC, uint32_t I) { + if (S.checkingPotentialConstantExpression() && S.Current->getDepth() == 0) + return false; + const Pointer &This = S.Current->getThis(); + if (!CheckThis(S, OpPC, This)) + return false; + const Pointer &Field = This.atField(I); + assert(Field.canBeInitialized()); Field.deref() = S.Stk.pop(); Field.activate(); Field.initialize(); @@ -1610,17 +1611,48 @@ bool InitThisBitField(InterpState &S, CodePtr OpPC, const Record::Field *F, if (!CheckThis(S, OpPC, This)) return false; const Pointer &Field = This.atField(FieldOffset); + assert(Field.canBeInitialized()); const auto &Value = S.Stk.pop(); Field.deref() = Value.truncate(F->Decl->getBitWidthValue()); Field.initialize(); return true; } +template ::T> +bool InitThisBitFieldActivate(InterpState &S, CodePtr OpPC, + const Record::Field *F, uint32_t FieldOffset) { + assert(F->isBitField()); + if (S.checkingPotentialConstantExpression() && S.Current->getDepth() == 0) + return false; + const Pointer &This = S.Current->getThis(); + if (!CheckThis(S, OpPC, This)) + return false; + const Pointer &Field = This.atField(FieldOffset); + assert(Field.canBeInitialized()); + const auto &Value = S.Stk.pop(); + Field.deref() = Value.truncate(F->Decl->getBitWidthValue()); + Field.initialize(); + Field.activate(); + return true; +} + /// 1) Pops the value from the stack /// 2) Peeks a pointer from the stack /// 3) Pushes the value to field I of the pointer on the stack template ::T> bool InitField(InterpState &S, CodePtr OpPC, uint32_t I) { + const T &Value = S.Stk.pop(); + const Pointer &Ptr = S.Stk.peek(); + if (!CheckRange(S, OpPC, Ptr, CSK_Field)) + return false; + const Pointer &Field = Ptr.atField(I); + Field.deref() = Value; + Field.initialize(); + return true; +} + +template ::T> +bool InitFieldActivate(InterpState &S, CodePtr OpPC, uint32_t I) { const T &Value = S.Stk.pop(); const Pointer &Ptr = S.Stk.peek(); if (!CheckRange(S, OpPC, Ptr, CSK_Field)) @@ -1638,6 +1670,32 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) { const T &Value = S.Stk.pop(); const Pointer &Field = S.Stk.peek().atField(F->Offset); + if constexpr (needsAlloc()) { + T Result = S.allocAP(Value.bitWidth()); + if (T::isSigned()) + Result.copy(Value.toAPSInt() + .trunc(F->Decl->getBitWidthValue()) + .sextOrTrunc(Value.bitWidth())); + else + Result.copy(Value.toAPSInt() + .trunc(F->Decl->getBitWidthValue()) + .zextOrTrunc(Value.bitWidth())); + + Field.deref() = Result; + } else { + Field.deref() = Value.truncate(F->Decl->getBitWidthValue()); + } + Field.initialize(); + return true; +} + +template ::T> +bool InitBitFieldActivate(InterpState &S, CodePtr OpPC, + const Record::Field *F) { + assert(F->isBitField()); + const T &Value = S.Stk.pop(); + const Pointer &Field = S.Stk.peek().atField(F->Offset); + if constexpr (needsAlloc()) { T Result = S.allocAP(Value.bitWidth()); if (T::isSigned()) @@ -1695,32 +1753,6 @@ inline bool 
GetPtrThisField(InterpState &S, CodePtr OpPC, uint32_t Off) { return true; } -inline bool GetPtrActiveField(InterpState &S, CodePtr OpPC, uint32_t Off) { - const Pointer &Ptr = S.Stk.pop(); - if (!CheckNull(S, OpPC, Ptr, CSK_Field)) - return false; - if (!CheckRange(S, OpPC, Ptr, CSK_Field)) - return false; - Pointer Field = Ptr.atField(Off); - Ptr.deactivate(); - Field.activate(); - S.Stk.push(std::move(Field)); - return true; -} - -inline bool GetPtrActiveThisField(InterpState &S, CodePtr OpPC, uint32_t Off) { - if (S.checkingPotentialConstantExpression()) - return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) - return false; - Pointer Field = This.atField(Off); - This.deactivate(); - Field.activate(); - S.Stk.push(std::move(Field)); - return true; -} - inline bool GetPtrDerivedPop(InterpState &S, CodePtr OpPC, uint32_t Off, bool NullOK, const Type *TargetType) { const Pointer &Ptr = S.Stk.pop(); @@ -1816,6 +1848,20 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) { inline bool FinishInitPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); + if (Ptr.canBeInitialized()) + Ptr.initialize(); + return true; +} + +inline bool FinishInit(InterpState &S, CodePtr OpPC) { + const Pointer &Ptr = S.Stk.peek(); + if (Ptr.canBeInitialized()) + Ptr.initialize(); + return true; +} + +inline bool FinishInitActivate(InterpState &S, CodePtr OpPC) { + const Pointer &Ptr = S.Stk.peek(); if (Ptr.canBeInitialized()) { Ptr.initialize(); Ptr.activate(); @@ -1823,8 +1869,8 @@ inline bool FinishInitPop(InterpState &S, CodePtr OpPC) { return true; } -inline bool FinishInit(InterpState &S, CodePtr OpPC) { - const Pointer &Ptr = S.Stk.peek(); +inline bool FinishInitActivatePop(InterpState &S, CodePtr OpPC) { + const Pointer &Ptr = S.Stk.pop(); if (Ptr.canBeInitialized()) { Ptr.initialize(); Ptr.activate(); @@ -1902,10 +1948,8 @@ bool Store(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.peek(); if (!CheckStore(S, OpPC, Ptr)) return false; - if (Ptr.canBeInitialized()) { + if (Ptr.canBeInitialized()) Ptr.initialize(); - Ptr.activate(); - } Ptr.deref() = Value; return true; } @@ -1916,10 +1960,46 @@ bool StorePop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); if (!CheckStore(S, OpPC, Ptr)) return false; + if (Ptr.canBeInitialized()) + Ptr.initialize(); + Ptr.deref() = Value; + return true; +} + +static inline bool Activate(InterpState &S, CodePtr OpPC) { + const Pointer &Ptr = S.Stk.peek(); + if (Ptr.canBeInitialized()) + Ptr.activate(); + return true; +} + +template ::T> +bool StoreActivate(InterpState &S, CodePtr OpPC) { + const T &Value = S.Stk.pop(); + const Pointer &Ptr = S.Stk.peek(); + + if (Ptr.canBeInitialized()) { + Ptr.initialize(); + Ptr.activate(); + } + + if (!CheckStore(S, OpPC, Ptr)) + return false; + Ptr.deref() = Value; + return true; +} + +template ::T> +bool StoreActivatePop(InterpState &S, CodePtr OpPC) { + const T &Value = S.Stk.pop(); + const Pointer &Ptr = S.Stk.pop(); + if (Ptr.canBeInitialized()) { Ptr.initialize(); Ptr.activate(); } + if (!CheckStore(S, OpPC, Ptr)) + return false; Ptr.deref() = Value; return true; } @@ -1930,10 +2010,8 @@ bool StoreBitField(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.peek(); if (!CheckStore(S, OpPC, Ptr)) return false; - if (Ptr.canBeInitialized()) { + if (Ptr.canBeInitialized()) Ptr.initialize(); - Ptr.activate(); - } if (const auto *FD = Ptr.getField()) Ptr.deref() = Value.truncate(FD->getBitWidthValue()); else @@ -1947,10 +2025,43 
@@ bool StoreBitFieldPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); if (!CheckStore(S, OpPC, Ptr)) return false; + if (Ptr.canBeInitialized()) + Ptr.initialize(); + if (const auto *FD = Ptr.getField()) + Ptr.deref() = Value.truncate(FD->getBitWidthValue()); + else + Ptr.deref() = Value; + return true; +} + +template ::T> +bool StoreBitFieldActivate(InterpState &S, CodePtr OpPC) { + const T &Value = S.Stk.pop(); + const Pointer &Ptr = S.Stk.peek(); + if (Ptr.canBeInitialized()) { + Ptr.initialize(); + Ptr.activate(); + } + if (!CheckStore(S, OpPC, Ptr)) + return false; + if (const auto *FD = Ptr.getField()) + Ptr.deref() = Value.truncate(FD->getBitWidthValue()); + else + Ptr.deref() = Value; + return true; +} + +template ::T> +bool StoreBitFieldActivatePop(InterpState &S, CodePtr OpPC) { + const T &Value = S.Stk.pop(); + const Pointer &Ptr = S.Stk.pop(); + if (Ptr.canBeInitialized()) { Ptr.initialize(); Ptr.activate(); } + if (!CheckStore(S, OpPC, Ptr)) + return false; if (const auto *FD = Ptr.getField()) Ptr.deref() = Value.truncate(FD->getBitWidthValue()); else @@ -1964,7 +2075,6 @@ bool Init(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.peek(); if (!CheckInit(S, OpPC, Ptr)) return false; - Ptr.activate(); Ptr.initialize(); new (&Ptr.deref()) T(Value); return true; @@ -1976,7 +2086,6 @@ bool InitPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); if (!CheckInit(S, OpPC, Ptr)) return false; - Ptr.activate(); Ptr.initialize(); new (&Ptr.deref()) T(Value); return true; diff --git a/clang/lib/AST/ByteCode/InterpBlock.h b/clang/lib/AST/ByteCode/InterpBlock.h index 7798b6f886a85..51622238e275c 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.h +++ b/clang/lib/AST/ByteCode/InterpBlock.h @@ -14,11 +14,6 @@ #define LLVM_CLANG_AST_INTERP_BLOCK_H #include "Descriptor.h" -#include "clang/AST/ComparisonCategories.h" -#include "clang/AST/Decl.h" -#include "clang/AST/DeclCXX.h" -#include "clang/AST/Expr.h" -#include "llvm/ADT/PointerUnion.h" #include "llvm/Support/raw_ostream.h" namespace clang { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index de0b97fd93c76..9ce1e380bff2c 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "../ExprConstShared.h" #include "Boolean.h" -#include "Compiler.h" #include "EvalEmitter.h" #include "Interp.h" #include "InterpBuiltinBitCast.h" diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 57e01f7bd9da0..804853d29512e 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -315,10 +315,6 @@ def GetPtrGlobal : OffsetOpcode; // [Pointer] -> [Pointer] def GetPtrField : OffsetOpcode; def GetPtrFieldPop : OffsetOpcode; -// [Pointer] -> [Pointer] -def GetPtrActiveField : OffsetOpcode; -// [] -> [Pointer] -def GetPtrActiveThisField : OffsetOpcode; // [] -> [Pointer] def GetPtrThisField : OffsetOpcode; // [Pointer] -> [Pointer] @@ -330,9 +326,10 @@ def GetMemberPtrBasePop : Opcode { let Args = [ArgSint32]; } - def FinishInitPop : Opcode; -def FinishInit : Opcode; +def FinishInit : Opcode; +def FinishInitActivate : Opcode; +def FinishInitActivatePop : Opcode; def FinishInitGlobal : Opcode; def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool, ArgTypePtr]; } @@ -460,16 +457,24 @@ def SetThisField : AccessOpcode; // [Value] -> [] def InitThisField : 
AccessOpcode; +def InitThisFieldActivate : AccessOpcode; // [Value] -> [] def InitThisBitField : Opcode { let Types = [AluTypeClass]; let Args = [ArgRecordField, ArgUint32]; let HasGroup = 1; } +def InitThisBitFieldActivate : Opcode { + let Types = [AluTypeClass]; + let Args = [ArgRecordField, ArgUint32]; + let HasGroup = 1; +} // [Pointer, Value] -> [] def InitField : AccessOpcode; +def InitFieldActivate : AccessOpcode; // [Pointer, Value] -> [] def InitBitField : BitFieldOpcode; +def InitBitFieldActivate : BitFieldOpcode; //===----------------------------------------------------------------------===// // Pointer access @@ -495,15 +500,16 @@ class StoreBitFieldOpcode : Opcode { let HasGroup = 1; } -// [Pointer, Value] -> [Pointer] def Store : StoreOpcode {} -// [Pointer, Value] -> [] def StorePop : StoreOpcode {} - -// [Pointer, Value] -> [Pointer] +def StoreActivatePop : StoreOpcode {} +def StoreActivate : StoreOpcode {} def StoreBitField : StoreBitFieldOpcode {} -// [Pointer, Value] -> [] def StoreBitFieldPop : StoreBitFieldOpcode {} +def StoreBitFieldActivate : StoreBitFieldOpcode {} +def StoreBitFieldActivatePop : StoreBitFieldOpcode {} + +def Activate : Opcode {} // [Pointer, Value] -> [] def Init : StoreOpcode {} diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index 159b4238a6a04..2f9ecf98e558e 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -502,8 +502,17 @@ void Pointer::activate() const { if (!getInlineDesc()->InUnion) return; - auto activate = [](Pointer &P) -> void { + std::function activate; + activate = [&activate](Pointer &P) -> void { P.getInlineDesc()->IsActive = true; + if (const Record *R = P.getRecord(); R && !R->isUnion()) { + for (const Record::Field &F : R->fields()) { + Pointer FieldPtr = P.atField(F.Offset); + if (!FieldPtr.getInlineDesc()->IsActive) + activate(FieldPtr); + } + // FIXME: Bases? + } }; std::function deactivate; diff --git a/clang/lib/AST/ByteCode/State.h b/clang/lib/AST/ByteCode/State.h index 9a81fa6b7d220..6fc33222ac956 100644 --- a/clang/lib/AST/ByteCode/State.h +++ b/clang/lib/AST/ByteCode/State.h @@ -35,6 +35,7 @@ enum AccessKinds { AK_Construct, AK_Destroy, AK_IsWithinLifetime, + AK_Dereference }; /// The order of this enum is important for diagnostics. 
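Pointer::activate() now recurses into non-union records so that when a struct nested inside a union becomes active, its own fields are marked active as well (base classes remain a FIXME). The behavior being modeled, as a sketch:

  union N { struct S { int x, y; } s; int z; };
  constexpr int nested() {
    N u{};      // .s is active and zero-initialized
    u.s.x = 1;  // .s and both of its fields must count as active
    u.s.y = 2;
    return u.s.x + u.s.y;
  }
  static_assert(nested() == 3);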
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index bd1b5950d30a6..83fcd87aec2f8 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4677,11 +4677,14 @@ void FieldDecl::setLazyInClassInitializer(LazyDeclStmtPtr NewInit) { Init = NewInit; } +bool FieldDecl::hasConstantIntegerBitWidth() const { + const auto *CE = dyn_cast_if_present(getBitWidth()); + return CE && CE->getAPValueResult().isInt(); +} + unsigned FieldDecl::getBitWidthValue() const { assert(isBitField() && "not a bitfield"); - assert(isa(getBitWidth())); - assert(cast(getBitWidth())->hasAPValueResult()); - assert(cast(getBitWidth())->getAPValueResult().isInt()); + assert(hasConstantIntegerBitWidth()); return cast(getBitWidth()) ->getAPValueResult() .getInt() diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 4514965009793..673e3f73858c7 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -3211,6 +3211,12 @@ UsingDirectiveDecl *UsingDirectiveDecl::CreateDeserialized(ASTContext &C, SourceLocation(), nullptr, nullptr); } +NamespaceDecl *NamespaceBaseDecl::getNamespace() { + if (auto *Alias = dyn_cast(this)) + return Alias->getNamespace(); + return cast(this); +} + NamespaceDecl *UsingDirectiveDecl::getNominatedNamespace() { if (auto *NA = dyn_cast_or_null(NominatedNamespace)) return NA->getNamespace(); @@ -3221,7 +3227,7 @@ NamespaceDecl::NamespaceDecl(ASTContext &C, DeclContext *DC, bool Inline, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, NamespaceDecl *PrevDecl, bool Nested) - : NamedDecl(Namespace, DC, IdLoc, Id), DeclContext(Namespace), + : NamespaceBaseDecl(Namespace, DC, IdLoc, Id), DeclContext(Namespace), redeclarable_base(C), LocStart(StartLoc) { setInline(Inline); setNested(Nested); @@ -3268,13 +3274,11 @@ NamespaceAliasDecl *NamespaceAliasDecl::getMostRecentDeclImpl() { return getMostRecentDecl(); } -NamespaceAliasDecl *NamespaceAliasDecl::Create(ASTContext &C, DeclContext *DC, - SourceLocation UsingLoc, - SourceLocation AliasLoc, - IdentifierInfo *Alias, - NestedNameSpecifierLoc QualifierLoc, - SourceLocation IdentLoc, - NamedDecl *Namespace) { +NamespaceAliasDecl *NamespaceAliasDecl::Create( + ASTContext &C, DeclContext *DC, SourceLocation UsingLoc, + SourceLocation AliasLoc, IdentifierInfo *Alias, + NestedNameSpecifierLoc QualifierLoc, SourceLocation IdentLoc, + NamespaceBaseDecl *Namespace) { // FIXME: Preserve the aliased namespace as written. if (auto *NS = dyn_cast_or_null(Namespace)) Namespace = NS->getFirstDecl(); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 419dd5dbdc695..8797eaddd0e18 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1529,7 +1529,7 @@ CallStackFrame::~CallStackFrame() { static bool isRead(AccessKinds AK) { return AK == AK_Read || AK == AK_ReadObjectRepresentation || - AK == AK_IsWithinLifetime; + AK == AK_IsWithinLifetime || AK == AK_Dereference; } static bool isModification(AccessKinds AK) { @@ -1540,6 +1540,7 @@ static bool isModification(AccessKinds AK) { case AK_DynamicCast: case AK_TypeId: case AK_IsWithinLifetime: + case AK_Dereference: return false; case AK_Assign: case AK_Increment: @@ -1558,15 +1559,16 @@ static bool isAnyAccess(AccessKinds AK) { /// Is this an access per the C++ definition? 
 static bool isFormalAccess(AccessKinds AK) {
   return isAnyAccess(AK) && AK != AK_Construct && AK != AK_Destroy &&
-         AK != AK_IsWithinLifetime;
+         AK != AK_IsWithinLifetime && AK != AK_Dereference;
 }
 
-/// Is this kind of axcess valid on an indeterminate object value?
+/// Is this kind of access valid on an indeterminate object value?
 static bool isValidIndeterminateAccess(AccessKinds AK) {
   switch (AK) {
   case AK_Read:
   case AK_Increment:
   case AK_Decrement:
+  case AK_Dereference:
     // These need the object's value.
     return false;
@@ -1733,7 +1735,10 @@ namespace {
   bool checkNullPointerForFoldAccess(EvalInfo &Info, const Expr *E,
                                      AccessKinds AK) {
     return checkNullPointerDiagnosingWith([&Info, E, AK] {
-      Info.FFDiag(E, diag::note_constexpr_access_null) << AK;
+      if (AK == AccessKinds::AK_Dereference)
+        Info.FFDiag(E, diag::note_constexpr_dereferencing_null);
+      else
+        Info.FFDiag(E, diag::note_constexpr_access_null) << AK;
     });
   }
 
@@ -4305,7 +4310,10 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E,
   }
 
   if (!LVal.Base) {
-    Info.FFDiag(E, diag::note_constexpr_access_null) << AK;
+    if (AK == AccessKinds::AK_Dereference)
+      Info.FFDiag(E, diag::note_constexpr_dereferencing_null);
+    else
+      Info.FFDiag(E, diag::note_constexpr_access_null) << AK;
     return CompleteObject();
   }
 
@@ -4407,8 +4415,9 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E,
       ConstexprVar = VD->isConstexpr();
 
     // Unless we're looking at a local variable or argument in a constexpr call,
-    // the variable we're reading must be const.
-    if (!Frame) {
+    // the variable we're reading must be const (unless we are binding to a
+    // reference).
+    if (AK != clang::AK_Dereference && !Frame) {
       if (IsAccess && isa<ParmVarDecl>(VD)) {
         // Access of a parameter that's not associated with a frame isn't going
         // to work out, but we can leave it to evaluateVarDeclInit to provide a
@@ -4441,7 +4450,8 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E,
       }
     } else if (!IsAccess) {
       return CompleteObject(LVal.getLValueBase(), nullptr, BaseType);
-    } else if (IsConstant && Info.checkingPotentialConstantExpression() &&
+    } else if ((IsConstant || BaseType->isReferenceType()) &&
+               Info.checkingPotentialConstantExpression() &&
                BaseType->isLiteralType(Info.Ctx) && !VD->hasDefinition()) {
       // This variable might end up being constexpr. Don't diagnose it yet.
     } else if (IsConstant) {
@@ -4472,15 +4482,21 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E,
       }
     }
 
-    if (!evaluateVarDeclInit(Info, E, VD, Frame, LVal.getLValueVersion(), BaseVal))
+    // When binding to a reference, the variable does not need to be constexpr
+    // or have constant initialization.
+    if (AK != clang::AK_Dereference &&
+        !evaluateVarDeclInit(Info, E, VD, Frame, LVal.getLValueVersion(),
+                             BaseVal))
       return CompleteObject();
 
     // If evaluateVarDeclInit sees a constexpr-unknown variable, it returns
     // a null BaseVal. Any constexpr-unknown variable seen here is an error:
     // we can't access a constexpr-unknown object.
- if (!BaseVal) { - Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1) - << AK << VD; - Info.Note(VD->getLocation(), diag::note_declared_at); + if (AK != clang::AK_Dereference && !BaseVal) { + if (!Info.checkingPotentialConstantExpression()) { + Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1) + << AK << VD; + Info.Note(VD->getLocation(), diag::note_declared_at); + } return CompleteObject(); } } else if (DynamicAllocLValue DA = LVal.Base.dyn_cast()) { @@ -4491,7 +4507,10 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } return CompleteObject(LVal.Base, &(*Alloc)->Value, LVal.Base.getDynamicAllocType()); - } else { + } + // When binding to a reference, the variable does not need to be + // within its lifetime. + else if (AK != clang::AK_Dereference) { const Expr *Base = LVal.Base.dyn_cast(); if (!Frame) { @@ -4572,7 +4591,7 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, NoteLValueLocation(Info, LVal.Base); return CompleteObject(); } - } else { + } else if (AK != clang::AK_Dereference) { BaseVal = Frame->getTemporary(Base, LVal.Base.getVersion()); assert(BaseVal && "missing value for temporary"); } @@ -5200,6 +5219,29 @@ enum EvalStmtResult { ESR_CaseNotFound }; } +/// Evaluates the initializer of a reference. +static bool EvaluateInitForDeclOfReferenceType(EvalInfo &Info, + const ValueDecl *D, + const Expr *Init, LValue &Result, + APValue &Val) { + assert(Init->isGLValue() && D->getType()->isReferenceType()); + // A reference is an lvalue. + if (!EvaluateLValue(Init, Result, Info)) + return false; + // [C++26][decl.ref] + // The object designated by such a glvalue can be outside its lifetime + // Because a null pointer value or a pointer past the end of an object + // does not point to an object, a reference in a well-defined program cannot + // refer to such things; + if (!Result.Designator.Invalid && Result.Designator.isOnePastTheEnd()) { + Info.FFDiag(Init, diag::note_constexpr_access_past_end) << AK_Dereference; + return false; + } + + // Save the result. + Result.moveInto(Val); + return true; +} static bool EvaluateVarDecl(EvalInfo &Info, const VarDecl *VD) { if (VD->isInvalidDecl()) @@ -5221,7 +5263,11 @@ static bool EvaluateVarDecl(EvalInfo &Info, const VarDecl *VD) { if (InitE->isValueDependent()) return false; - if (!EvaluateInPlace(Val, Info, Result, InitE)) { + // For references to objects, check they do not designate a one-past-the-end + // object. + if (VD->getType()->isReferenceType()) { + return EvaluateInitForDeclOfReferenceType(Info, VD, InitE, Result, Val); + } else if (!EvaluateInPlace(Val, Info, Result, InitE)) { // Wipe out any partially-computed value, to allow tracking that this // evaluation failed. 
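EvaluateInitForDeclOfReferenceType separates binding a reference from reading through it: binding only needs a valid object designation, not a constant value, but must not designate one past the end of an object. In source terms (illustrative sketch):

  int global;                    // neither const nor constexpr
  constexpr int &ok = global;    // OK: binding performs no read of 'global'
  constexpr int arr[2]{};
  // constexpr const int &bad = arr[2]; // ill-formed: designates one past the end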
Val = APValue(); @@ -6851,9 +6897,18 @@ static bool HandleConstructorCall(const Expr *E, const LValue &This, ThisOverrideRAII ThisOverride(*Info.CurrentCall, &SubobjectParent, isa(Init)); FullExpressionRAII InitScope(Info); - if (!EvaluateInPlace(*Value, Info, Subobject, Init) || - (FD && FD->isBitField() && - !truncateBitfieldValue(Info, Init, *Value, FD))) { + if (FD && FD->getType()->isReferenceType() && + !FD->getType()->isFunctionReferenceType()) { + LValue Result; + if (!EvaluateInitForDeclOfReferenceType(Info, FD, Init, Result, + *Value)) { + if (!Info.noteFailure()) + return false; + Success = false; + } + } else if (!EvaluateInPlace(*Value, Info, Subobject, Init) || + (FD && FD->isBitField() && + !truncateBitfieldValue(Info, Init, *Value, FD))) { // If we're checking for a potential constant expression, evaluate all // initializers even if some of them fail. if (!Info.noteFailure()) @@ -9287,7 +9342,13 @@ bool LValueExprEvaluator::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { } bool LValueExprEvaluator::VisitUnaryDeref(const UnaryOperator *E) { - return evaluatePointer(E->getSubExpr(), Result); + bool Success = evaluatePointer(E->getSubExpr(), Result); + // [C++26][expr.unary.op] + // If the operand points to an object or function, the result + // denotes that object or function; otherwise, the behavior is undefined. + return Success && + (!E->getType().getNonReferenceType()->isObjectType() || + findCompleteObject(Info, E, AK_Dereference, Result, E->getType())); } bool LValueExprEvaluator::VisitUnaryReal(const UnaryOperator *E) { @@ -10906,9 +10967,17 @@ bool RecordExprEvaluator::VisitCXXParenListOrInitListExpr( isa(Init)); APValue &FieldVal = Result.getStructField(Field->getFieldIndex()); - if (!EvaluateInPlace(FieldVal, Info, Subobject, Init) || - (Field->isBitField() && !truncateBitfieldValue(Info, Init, - FieldVal, Field))) { + if (Field->getType()->isReferenceType()) { + LValue Result; + if (!EvaluateInitForDeclOfReferenceType(Info, Field, Init, Result, + FieldVal)) { + if (!Info.noteFailure()) + return false; + Success = false; + } + } else if (!EvaluateInPlace(FieldVal, Info, Subobject, Init) || + (Field->isBitField() && + !truncateBitfieldValue(Info, Init, FieldVal, Field))) { if (!Info.noteFailure()) return false; Success = false; @@ -14478,12 +14547,6 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E, if (!EvaluatePointer(E->getRHS(), RHSValue, Info) || !LHSOK) return false; - // If we have Unknown pointers we should fail if they are not global values. - if (!(IsGlobalLValue(LHSValue.getLValueBase()) && - IsGlobalLValue(RHSValue.getLValueBase())) && - (LHSValue.AllowConstexprUnknown || RHSValue.AllowConstexprUnknown)) - return false; - // Reject differing bases from the normal codepath; we special-case // comparisons to null. if (!HasSameBase(LHSValue, RHSValue)) { @@ -14545,6 +14608,10 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E, (LHSValue.Base && isZeroSized(RHSValue))) return DiagComparison( diag::note_constexpr_pointer_comparison_zero_sized); + if (LHSValue.AllowConstexprUnknown || RHSValue.AllowConstexprUnknown) + return DiagComparison( + diag::note_constexpr_pointer_comparison_unspecified); + // FIXME: Verify both variables are live. 
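Deferring the constexpr-unknown check to this point turns an opaque evaluation failure into a targeted note. A sketch of when it can fire (illustrative; my reading is that the evaluator treats the parameters below as constexpr-unknown while checking whether the function is potentially constant):

  constexpr bool same_object(const int &a, const int &b) {
    return &a == &b;  // addresses of unknown referents: the comparison is
  }                   // now diagnosed as unspecified rather than failing silently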
return Success(CmpResult::Unequal, E); } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 0520987ce6b3a..6d082b31a9caa 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1384,14 +1384,6 @@ void CXXNameMangler::mangleUnresolvedPrefix(NestedNameSpecifier *qualifier, Out << "sr"; mangleSourceNameWithAbiTags(qualifier->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - if (qualifier->getPrefix()) - mangleUnresolvedPrefix(qualifier->getPrefix(), - /*recursive*/ true); - else - Out << "sr"; - mangleSourceNameWithAbiTags(qualifier->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: { const Type *type = qualifier->getAsType(); @@ -2185,11 +2177,7 @@ void CXXNameMangler::manglePrefix(NestedNameSpecifier *qualifier) { llvm_unreachable("Can't mangle __super specifier"); case NestedNameSpecifier::Namespace: - mangleName(qualifier->getAsNamespace()); - return; - - case NestedNameSpecifier::NamespaceAlias: - mangleName(qualifier->getAsNamespaceAlias()->getNamespace()); + mangleName(qualifier->getAsNamespace()->getNamespace()); return; case NestedNameSpecifier::TypeSpec: diff --git a/clang/lib/AST/NestedNameSpecifier.cpp b/clang/lib/AST/NestedNameSpecifier.cpp index db1ad89565189..56f74b92412d2 100644 --- a/clang/lib/AST/NestedNameSpecifier.cpp +++ b/clang/lib/AST/NestedNameSpecifier.cpp @@ -66,10 +66,9 @@ NestedNameSpecifier *NestedNameSpecifier::Create(const ASTContext &Context, return FindOrInsert(Context, Mockup); } -NestedNameSpecifier * -NestedNameSpecifier::Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - const NamespaceDecl *NS) { +NestedNameSpecifier *NestedNameSpecifier::Create(const ASTContext &Context, + NestedNameSpecifier *Prefix, + const NamespaceBaseDecl *NS) { assert(NS && "Namespace cannot be NULL"); assert((!Prefix || (Prefix->getAsType() == nullptr && @@ -78,23 +77,7 @@ NestedNameSpecifier::Create(const ASTContext &Context, NestedNameSpecifier Mockup; Mockup.Prefix.setPointer(Prefix); Mockup.Prefix.setInt(StoredDecl); - Mockup.Specifier = const_cast(NS); - return FindOrInsert(Context, Mockup); -} - -NestedNameSpecifier * -NestedNameSpecifier::Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - const NamespaceAliasDecl *Alias) { - assert(Alias && "Namespace alias cannot be NULL"); - assert((!Prefix || - (Prefix->getAsType() == nullptr && - Prefix->getAsIdentifier() == nullptr)) && - "Broken nested name specifier"); - NestedNameSpecifier Mockup; - Mockup.Prefix.setPointer(Prefix); - Mockup.Prefix.setInt(StoredDecl); - Mockup.Specifier = const_cast(Alias); + Mockup.Specifier = const_cast(NS); return FindOrInsert(Context, Mockup); } @@ -147,9 +130,7 @@ NestedNameSpecifier::SpecifierKind NestedNameSpecifier::getKind() const { case StoredDecl: { NamedDecl *ND = static_cast(Specifier); - if (isa(ND)) - return Super; - return isa(ND) ? Namespace : NamespaceAlias; + return isa(ND) ? Super : Namespace; } case StoredTypeSpec: @@ -159,18 +140,11 @@ NestedNameSpecifier::SpecifierKind NestedNameSpecifier::getKind() const { llvm_unreachable("Invalid NNS Kind!"); } -/// Retrieve the namespace stored in this nested name specifier. -NamespaceDecl *NestedNameSpecifier::getAsNamespace() const { +/// Retrieve the namespace or namespace alias stored in this nested name +/// specifier. 
+NamespaceBaseDecl *NestedNameSpecifier::getAsNamespace() const { if (Prefix.getInt() == StoredDecl) - return dyn_cast(static_cast(Specifier)); - - return nullptr; -} - -/// Retrieve the namespace alias stored in this nested name specifier. -NamespaceAliasDecl *NestedNameSpecifier::getAsNamespaceAlias() const { - if (Prefix.getInt() == StoredDecl) - return dyn_cast(static_cast(Specifier)); + return dyn_cast(static_cast(Specifier)); return nullptr; } @@ -204,7 +178,6 @@ NestedNameSpecifierDependence NestedNameSpecifier::getDependence() const { } case Namespace: - case NamespaceAlias: case Global: return NestedNameSpecifierDependence::None; @@ -284,7 +257,6 @@ NestedNameSpecifier::translateToType(const ASTContext &Context) const { } case SpecifierKind::Global: case SpecifierKind::Namespace: - case SpecifierKind::NamespaceAlias: case SpecifierKind::Super: // These are not representable as types. return nullptr; @@ -305,16 +277,16 @@ void NestedNameSpecifier::print(raw_ostream &OS, const PrintingPolicy &Policy, OS << getAsIdentifier()->getName(); break; - case Namespace: - if (getAsNamespace()->isAnonymousNamespace()) - return; - - OS << getAsNamespace()->getName(); - break; - - case NamespaceAlias: - OS << getAsNamespaceAlias()->getName(); + case Namespace: { + NamespaceBaseDecl *Namespace = getAsNamespace(); + if (const auto *NS = dyn_cast(Namespace)) { + assert(!NS->isAnonymousNamespace()); + OS << NS->getName(); + } else { + OS << cast(Namespace)->getName(); + } break; + } case Global: OS << "::"; @@ -367,7 +339,6 @@ NestedNameSpecifierLoc::getLocalDataLength(NestedNameSpecifier *Qualifier) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Super: // The location of the identifier or namespace name. Length += sizeof(SourceLocation::UIntTy); @@ -418,7 +389,6 @@ SourceRange NestedNameSpecifierLoc::getLocalSourceRange() const { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Super: return SourceRange( LoadSourceLocation(Data, Offset), @@ -569,7 +539,7 @@ void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, } void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, - NamespaceDecl *Namespace, + NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc) { Representation = NestedNameSpecifier::Create(Context, Representation, @@ -580,17 +550,6 @@ void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, SaveSourceLocation(ColonColonLoc, Buffer, BufferSize, BufferCapacity); } -void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, - NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, - SourceLocation ColonColonLoc) { - Representation = NestedNameSpecifier::Create(Context, Representation, Alias); - - // Push source-location info into the buffer. 
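At the source level, the unified Namespace kind means a qualifier may record either a namespace or an alias; printing preserves what was written, while NamespaceBaseDecl::getNamespace() folds the alias away wherever the underlying entity is needed (name mangling, for instance, stays alias-transparent). Sketch, not from this patch:

  namespace real { struct T {}; }
  namespace alias = real;
  alias::T a;  // qualifier records the NamespaceAliasDecl; printed as 'alias::'
  real::T b;   // qualifier records the NamespaceDecl; printed as 'real::'
  // Both resolve to the same namespace via getNamespace().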
- SaveSourceLocation(AliasLoc, Buffer, BufferSize, BufferCapacity); - SaveSourceLocation(ColonColonLoc, Buffer, BufferSize, BufferCapacity); -} - void NestedNameSpecifierLocBuilder::MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc) { assert(!Representation && "Already have a nested-name-specifier!?"); @@ -627,7 +586,6 @@ void NestedNameSpecifierLocBuilder::MakeTrivial(ASTContext &Context, switch (NNS->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: SaveSourceLocation(R.getBegin(), Buffer, BufferSize, BufferCapacity); break; diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index 7fdfcfa3014f3..bd87d4418484b 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -127,9 +127,6 @@ void ODRHash::AddNestedNameSpecifier(const NestedNameSpecifier *NNS) { case NestedNameSpecifier::Namespace: AddDecl(NNS->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - AddDecl(NNS->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: AddType(NNS->getAsType()); break; diff --git a/clang/lib/AST/QualTypeNames.cpp b/clang/lib/AST/QualTypeNames.cpp index 39703d6d7b882..b43bcd8d1f1c1 100644 --- a/clang/lib/AST/QualTypeNames.cpp +++ b/clang/lib/AST/QualTypeNames.cpp @@ -218,16 +218,7 @@ static NestedNameSpecifier *getFullyQualifiedNestedNameSpecifier( return Scope; case NestedNameSpecifier::Namespace: return TypeName::createNestedNameSpecifier( - Ctx, Scope->getAsNamespace(), WithGlobalNsPrefix); - case NestedNameSpecifier::NamespaceAlias: - // Namespace aliases are only valid for the duration of the - // scope where they were introduced, and therefore are often - // invalid at the end of the TU. So use the namespace name more - // likely to be valid at the end of the TU. - return TypeName::createNestedNameSpecifier( - Ctx, - Scope->getAsNamespaceAlias()->getNamespace()->getCanonicalDecl(), - WithGlobalNsPrefix); + Ctx, Scope->getAsNamespace()->getNamespace(), WithGlobalNsPrefix); case NestedNameSpecifier::Identifier: // A function or some other construct that makes it un-namable // at the end of the TU. Skip the current component of the name, diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index be02bdde38a3d..6ba5ec89964a9 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -610,7 +610,7 @@ void StmtPrinter::VisitObjCAtTryStmt(ObjCAtTryStmt *Node) { } } - if (auto *FS = static_cast(Node->getFinallyStmt())) { + if (ObjCAtFinallyStmt *FS = Node->getFinallyStmt()) { Indent() << "@finally"; if (auto *CS = dyn_cast(FS->getFinallyBody())) { PrintRawCompoundStmt(CS); diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 9d7c2757d6ee4..3d9397fb0b540 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1050,10 +1050,6 @@ void clang::TextNodeDumper::dumpNestedNameSpecifier(const NestedNameSpecifier *N OS << " "; // "Namespace" is printed as the decl kind. dumpBareDeclRef(NNS->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - OS << " "; // "NamespaceAlias" is printed as the decl kind. 
- dumpBareDeclRef(NNS->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: OS << " TypeSpec"; dumpType(QualType(NNS->getAsType(), 0)); diff --git a/clang/lib/ASTMatchers/Dynamic/Parser.cpp b/clang/lib/ASTMatchers/Dynamic/Parser.cpp index 8a5ac4d0f9d0c..baac797f09ec3 100644 --- a/clang/lib/ASTMatchers/Dynamic/Parser.cpp +++ b/clang/lib/ASTMatchers/Dynamic/Parser.cpp @@ -490,6 +490,11 @@ bool Parser::parseMatcherBuilder(MatcherCtor Ctor, const TokenInfo &NameToken, << CommaToken.Text; return false; } + // Allow for a trailing , token and possibly a new line. + Tokenizer->SkipNewlines(); + if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { + continue; + } } Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, @@ -658,6 +663,11 @@ bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken, << CommaToken.Text; return false; } + // Allow for a trailing , token and possibly a new line. + Tokenizer->SkipNewlines(); + if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { + continue; + } } Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index bf67bea6c9933..e3a03cf93880e 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/ImmutableMap.h" #include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TimeProfiler.h" @@ -496,7 +497,177 @@ class FactGenerator : public ConstStmtVisitor { }; // ========================================================================= // -// The Dataflow Lattice +// Generic Dataflow Analysis +// ========================================================================= // + +enum class Direction { Forward, Backward }; + +/// A generic, policy-based driver for dataflow analyses. It combines +/// the dataflow runner and the transferer logic into a single class hierarchy. +/// +/// The derived class is expected to provide: +/// - A `Lattice` type. +/// - `StringRef getAnalysisName() const` +/// - `Lattice getInitialState();` The initial state of the analysis. +/// - `Lattice join(Lattice, Lattice);` Merges states from multiple CFG paths. +/// - `Lattice transfer(Lattice, const FactType&);` Defines how a single +/// lifetime-relevant `Fact` transforms the lattice state. Only overloads +/// for facts relevant to the analysis need to be implemented. +/// +/// \tparam Derived The CRTP derived class that implements the specific +/// analysis. +/// \tparam LatticeType The dataflow lattice used by the analysis. +/// \tparam Dir The direction of the analysis (Forward or Backward). +/// TODO: Maybe use the dataflow framework! The framework might need changes +/// to support the current comparison done at block-entry. 
+template <typename Derived, typename LatticeType, Direction Dir>
+class DataflowAnalysis {
+public:
+  using Lattice = LatticeType;
+  using Base = DataflowAnalysis<Derived, LatticeType, Dir>;
+
+private:
+  const CFG &Cfg;
+  AnalysisDeclContext &AC;
+
+  llvm::DenseMap<const CFGBlock *, Lattice> InStates;
+  llvm::DenseMap<const CFGBlock *, Lattice> OutStates;
+
+  static constexpr bool isForward() { return Dir == Direction::Forward; }
+
+protected:
+  FactManager &AllFacts;
+
+  explicit DataflowAnalysis(const CFG &C, AnalysisDeclContext &AC,
+                            FactManager &F)
+      : Cfg(C), AC(AC), AllFacts(F) {}
+
+public:
+  void run() {
+    Derived &D = static_cast<Derived &>(*this);
+    llvm::TimeTraceScope Time(D.getAnalysisName());
+
+    using Worklist =
+        std::conditional_t<isForward(), ForwardDataflowWorklist,
+                           BackwardDataflowWorklist>;
+    Worklist W(Cfg, AC);
+
+    const CFGBlock *Start = isForward() ? &Cfg.getEntry() : &Cfg.getExit();
+    InStates[Start] = D.getInitialState();
+    W.enqueueBlock(Start);
+
+    llvm::SmallBitVector Visited(Cfg.getNumBlockIDs() + 1);
+
+    while (const CFGBlock *B = W.dequeue()) {
+      Lattice StateIn = getInState(B);
+      Lattice StateOut = transferBlock(B, StateIn);
+      OutStates[B] = StateOut;
+      Visited.set(B->getBlockID());
+      for (const CFGBlock *AdjacentB : isForward() ? B->succs() : B->preds()) {
+        Lattice OldInState = getInState(AdjacentB);
+        Lattice NewInState = D.join(OldInState, StateOut);
+        // Enqueue the adjacent block if its in-state has changed or if we have
+        // never visited it.
+        if (!Visited.test(AdjacentB->getBlockID()) ||
+            NewInState != OldInState) {
+          InStates[AdjacentB] = NewInState;
+          W.enqueueBlock(AdjacentB);
+        }
+      }
+    }
+  }
+
+  Lattice getInState(const CFGBlock *B) const { return InStates.lookup(B); }
+
+  Lattice getOutState(const CFGBlock *B) const { return OutStates.lookup(B); }
+
+  void dump() const {
+    const Derived *D = static_cast<const Derived *>(this);
+    llvm::dbgs() << "==========================================\n";
+    llvm::dbgs() << D->getAnalysisName() << " results:\n";
+    llvm::dbgs() << "==========================================\n";
+    const CFGBlock &B = isForward() ? Cfg.getExit() : Cfg.getEntry();
+    getOutState(&B).dump(llvm::dbgs());
+  }
+
+  /// Computes the state at one end of a block by applying all its facts
+  /// sequentially to a given state from the other end.
+  /// TODO: We might need to store intermediate states per-fact in the block for
+  /// later analysis.
+  Lattice transferBlock(const CFGBlock *Block, Lattice State) {
+    auto Facts = AllFacts.getFacts(Block);
+    if constexpr (isForward())
+      for (const Fact *F : Facts)
+        State = transferFact(State, F);
+    else
+      for (const Fact *F : llvm::reverse(Facts))
+        State = transferFact(State, F);
+    return State;
+  }
+
+  Lattice transferFact(Lattice In, const Fact *F) {
+    assert(F);
+    Derived *D = static_cast<Derived *>(this);
+    switch (F->getKind()) {
+    case Fact::Kind::Issue:
+      return D->transfer(In, *F->getAs<IssueFact>());
+    case Fact::Kind::Expire:
+      return D->transfer(In, *F->getAs<ExpireFact>());
+    case Fact::Kind::AssignOrigin:
+      return D->transfer(In, *F->getAs<AssignOriginFact>());
+    case Fact::Kind::ReturnOfOrigin:
+      return D->transfer(In, *F->getAs<ReturnOfOriginFact>());
+    }
+    llvm_unreachable("Unknown fact kind");
+  }
+
+public:
+  Lattice transfer(Lattice In, const IssueFact &) { return In; }
+  Lattice transfer(Lattice In, const ExpireFact &) { return In; }
+  Lattice transfer(Lattice In, const AssignOriginFact &) { return In; }
+  Lattice transfer(Lattice In, const ReturnOfOriginFact &) { return In; }
+};
+
+namespace utils {
+
+/// Computes the union of two ImmutableSets.
+template <typename T>
+llvm::ImmutableSet<T> join(llvm::ImmutableSet<T> A, llvm::ImmutableSet<T> B,
+                           typename llvm::ImmutableSet<T>::Factory &F) {
+  if (A.getHeight() < B.getHeight())
+    std::swap(A, B);
+  for (const T &E : B)
+    A = F.add(A, E);
+  return A;
+}
+
+/// Computes the key-wise union of two ImmutableMaps.
+// TODO(opt): This key-wise join is a performance bottleneck. A more
+// efficient merge could be implemented using a Patricia Trie or HAMT
+// instead of the current AVL-tree-based ImmutableMap.
+template <typename K, typename V, typename Joiner>
+llvm::ImmutableMap<K, V>
+join(llvm::ImmutableMap<K, V> A, llvm::ImmutableMap<K, V> B,
+     typename llvm::ImmutableMap<K, V>::Factory &F, Joiner joinValues) {
+  if (A.getHeight() < B.getHeight())
+    std::swap(A, B);
+
+  // For each element in B, join it with the corresponding element in A
+  // (or with an empty value if it doesn't exist in A).
+  for (const auto &Entry : B) {
+    const K &Key = Entry.first;
+    const V &ValB = Entry.second;
+    if (const V *ValA = A.lookup(Key))
+      A = F.add(A, Key, joinValues(*ValA, ValB));
+    else
+      A = F.add(A, Key, ValB);
+  }
+  return A;
+}
+} // namespace utils
+
+// ========================================================================= //
+// Loan Propagation Analysis
+// ========================================================================= //
 
 // Using LLVM's immutable collections is efficient for dataflow analysis
@@ -509,82 +680,37 @@ using OriginLoanMap = llvm::ImmutableMap<OriginID, LoanSet>;
 /// that all created states share the same underlying memory management.
 struct LifetimeFactory {
   OriginLoanMap::Factory OriginMapFactory;
-  LoanSet::Factory LoanSetFact;
+  LoanSet::Factory LoanSetFactory;
 
   /// Creates a singleton set containing only the given loan ID.
   LoanSet createLoanSet(LoanID LID) {
-    return LoanSetFact.add(LoanSetFact.getEmptySet(), LID);
+    return LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID);
   }
 };
 
-/// LifetimeLattice represents the state of our analysis at a given program
-/// point. It is an immutable object, and all operations produce a new
-/// instance rather than modifying the existing one.
-struct LifetimeLattice {
+/// Represents the dataflow lattice for loan propagation.
+///
+/// This lattice tracks which loans each origin may hold at a given program
+/// point. The lattice has a finite height: An origin's loan set is bounded by
+/// the total number of loans in the function.
+/// TODO(opt): To reduce the lattice size, propagate origins of declarations,
+/// not expressions, because expressions are not visible across blocks.
+struct LoanPropagationLattice {
   /// The map from an origin to the set of loans it contains.
-  /// The lattice has a finite height: An origin's loan set is bounded by the
-  /// total number of loans in the function.
-  /// TODO(opt): To reduce the lattice size, propagate origins of declarations,
-  /// not expressions, because expressions are not visible across blocks.
OriginLoanMap Origins = OriginLoanMap(nullptr); - explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {} - LifetimeLattice() = default; + explicit LoanPropagationLattice(const OriginLoanMap &S) : Origins(S) {} + LoanPropagationLattice() = default; - bool operator==(const LifetimeLattice &Other) const { + bool operator==(const LoanPropagationLattice &Other) const { return Origins == Other.Origins; } - bool operator!=(const LifetimeLattice &Other) const { + bool operator!=(const LoanPropagationLattice &Other) const { return !(*this == Other); } - LoanSet getLoans(OriginID OID) const { - if (auto *Loans = Origins.lookup(OID)) - return *Loans; - return LoanSet(nullptr); - } - - /// Computes the union of two lattices by performing a key-wise join of - /// their OriginLoanMaps. - // TODO(opt): This key-wise join is a performance bottleneck. A more - // efficient merge could be implemented using a Patricia Trie or HAMT - // instead of the current AVL-tree-based ImmutableMap. - // TODO(opt): Keep the state small by removing origins which become dead. - LifetimeLattice join(const LifetimeLattice &Other, - LifetimeFactory &Factory) const { - /// Merge the smaller map into the larger one ensuring we iterate over the - /// smaller map. - if (Origins.getHeight() < Other.Origins.getHeight()) - return Other.join(*this, Factory); - - OriginLoanMap JoinedState = Origins; - // For each origin in the other map, union its loan set with ours. - for (const auto &Entry : Other.Origins) { - OriginID OID = Entry.first; - LoanSet OtherLoanSet = Entry.second; - JoinedState = Factory.OriginMapFactory.add( - JoinedState, OID, join(getLoans(OID), OtherLoanSet, Factory)); - } - return LifetimeLattice(JoinedState); - } - - LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const { - /// Merge the smaller set into the larger one ensuring we iterate over the - /// smaller set. - if (a.getHeight() < b.getHeight()) - std::swap(a, b); - LoanSet Result = a; - for (LoanID LID : b) { - /// TODO(opt): Profiling shows that this loop is a major performance - /// bottleneck. Investigate using a BitVector to represent the set of - /// loans for improved join performance. - Result = Factory.LoanSetFact.add(Result, LID); - } - return Result; - } - void dump(llvm::raw_ostream &OS) const { - OS << "LifetimeLattice State:\n"; + OS << "LoanPropagationLattice State:\n"; if (Origins.isEmpty()) OS << " \n"; for (const auto &Entry : Origins) { @@ -596,143 +722,67 @@ struct LifetimeLattice { } }; -// ========================================================================= // -// The Transfer Function -// ========================================================================= // -class Transferer { - FactManager &AllFacts; +/// The analysis that tracks which loans belong to which origins. +class LoanPropagationAnalysis + : public DataflowAnalysis { + LifetimeFactory &Factory; public: - explicit Transferer(FactManager &F, LifetimeFactory &Factory) - : AllFacts(F), Factory(Factory) {} + LoanPropagationAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F, + LifetimeFactory &Factory) + : DataflowAnalysis(C, AC, F), Factory(Factory) {} - /// Computes the exit state of a block by applying all its facts sequentially - /// to a given entry state. - /// TODO: We might need to store intermediate states per-fact in the block for - /// later analysis. 
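The constructor above is all the wiring a derived analysis needs; the remaining CRTP hooks are equally small. A hypothetical sketch of another client of the same base class, left as declarations: the names `LoanExpiryAnalysis` and `ExpiryLattice` are invented here for illustration and are not in this patch (the TODO list further below merely anticipates such analyses).

  class LoanExpiryAnalysis
      : public DataflowAnalysis<LoanExpiryAnalysis, ExpiryLattice,
                                Direction::Forward> {
  public:
    LoanExpiryAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F)
        : DataflowAnalysis(C, AC, F) {}
    // Inherit the no-op defaults for fact kinds this analysis ignores.
    using Base::transfer;

    StringRef getAnalysisName() const { return "LoanExpiry"; }
    ExpiryLattice getInitialState() { return ExpiryLattice{}; }
    // Meet operator: union of the loans known to have expired on any path.
    ExpiryLattice join(ExpiryLattice A, ExpiryLattice B);
    // The only interesting fact: record that F's loan has expired.
    ExpiryLattice transfer(ExpiryLattice In, const ExpireFact &F);
  };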
- LifetimeLattice transferBlock(const CFGBlock *Block, - LifetimeLattice EntryState) { - LifetimeLattice BlockState = EntryState; - llvm::ArrayRef Facts = AllFacts.getFacts(Block); + using Base::transfer; - for (const Fact *F : Facts) { - BlockState = transferFact(BlockState, F); - } - return BlockState; - } + StringRef getAnalysisName() const { return "LoanPropagation"; } -private: - LifetimeLattice transferFact(LifetimeLattice In, const Fact *F) { - switch (F->getKind()) { - case Fact::Kind::Issue: - return transfer(In, *F->getAs()); - case Fact::Kind::AssignOrigin: - return transfer(In, *F->getAs()); - // Expire and ReturnOfOrigin facts don't modify the Origins and the State. - case Fact::Kind::Expire: - case Fact::Kind::ReturnOfOrigin: - return In; - } - llvm_unreachable("Unknown fact kind"); + Lattice getInitialState() { return Lattice{}; } + + /// Merges two lattices by taking the union of loans for each origin. + // TODO(opt): Keep the state small by removing origins which become dead. + Lattice join(Lattice A, Lattice B) { + OriginLoanMap JoinedOrigins = + utils::join(A.Origins, B.Origins, Factory.OriginMapFactory, + [this](LoanSet S1, LoanSet S2) { + return utils::join(S1, S2, Factory.LoanSetFactory); + }); + return Lattice(JoinedOrigins); } /// A new loan is issued to the origin. Old loans are erased. - LifetimeLattice transfer(LifetimeLattice In, const IssueFact &F) { + Lattice transfer(Lattice In, const IssueFact &F) { OriginID OID = F.getOriginID(); LoanID LID = F.getLoanID(); - return LifetimeLattice(Factory.OriginMapFactory.add( + return LoanPropagationLattice(Factory.OriginMapFactory.add( In.Origins, OID, Factory.createLoanSet(LID))); } /// The destination origin's loan set is replaced by the source's. /// This implicitly "resets" the old loans of the destination. - LifetimeLattice transfer(LifetimeLattice InState, const AssignOriginFact &F) { + Lattice transfer(Lattice In, const AssignOriginFact &F) { OriginID DestOID = F.getDestOriginID(); OriginID SrcOID = F.getSrcOriginID(); - LoanSet SrcLoans = InState.getLoans(SrcOID); - return LifetimeLattice( - Factory.OriginMapFactory.add(InState.Origins, DestOID, SrcLoans)); - } -}; - -// ========================================================================= // -// Dataflow analysis -// ========================================================================= // - -/// Drives the intra-procedural dataflow analysis. -/// -/// Orchestrates the analysis by iterating over the CFG using a worklist -/// algorithm. It computes a fixed point by propagating the LifetimeLattice -/// state through each block until the state no longer changes. -/// TODO: Maybe use the dataflow framework! The framework might need changes -/// to support the current comparison done at block-entry. -class LifetimeDataflow { - const CFG &Cfg; - AnalysisDeclContext &AC; - LifetimeFactory LifetimeFact; - - Transferer Xfer; - - /// Stores the merged analysis state at the entry of each CFG block. - llvm::DenseMap BlockEntryStates; - /// Stores the analysis state at the exit of each CFG block, after the - /// transfer function has been applied. 
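The `join` override above delegates to the `utils::join` helpers defined earlier. A minimal usage sketch of those helpers in isolation (illustrative, not part of the patch; assumes llvm/ADT/ImmutableSet.h and llvm/ADT/ImmutableMap.h, with toy int keys and values standing in for origins and loan sets):

  llvm::ImmutableSet<int>::Factory SetF;
  llvm::ImmutableSet<int> A = SetF.add(SetF.getEmptySet(), 1);  // {1}
  llvm::ImmutableSet<int> B = SetF.add(SetF.getEmptySet(), 2);  // {2}
  llvm::ImmutableSet<int> S = utils::join(A, B, SetF);          // {1, 2}

  llvm::ImmutableMap<int, int>::Factory MapF;
  llvm::ImmutableMap<int, int> M1 = MapF.add(MapF.getEmptyMap(), 0, 10);
  llvm::ImmutableMap<int, int> M2 = MapF.add(MapF.getEmptyMap(), 0, 32);
  // Key 0 is present in both maps, so the joiner merges the two values;
  // keys present in only one map are copied through unchanged.
  llvm::ImmutableMap<int, int> M =
      utils::join(M1, M2, MapF, [](int L, int R) { return L + R; });  // {0 -> 42}

The height comparison inside both helpers swaps the operands so the loop always iterates over the smaller tree, which keeps repeated joins against a large accumulated state cheap.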
- llvm::DenseMap BlockExitStates; - -public: - LifetimeDataflow(const CFG &C, FactManager &FS, AnalysisDeclContext &AC) - : Cfg(C), AC(AC), Xfer(FS, LifetimeFact) {} - - void run() { - llvm::TimeTraceScope TimeProfile("Lifetime Dataflow"); - ForwardDataflowWorklist Worklist(Cfg, AC); - const CFGBlock *Entry = &Cfg.getEntry(); - BlockEntryStates[Entry] = LifetimeLattice{}; - Worklist.enqueueBlock(Entry); - while (const CFGBlock *B = Worklist.dequeue()) { - LifetimeLattice EntryState = getEntryState(B); - LifetimeLattice ExitState = Xfer.transferBlock(B, EntryState); - BlockExitStates[B] = ExitState; - - for (const CFGBlock *Successor : B->succs()) { - auto SuccIt = BlockEntryStates.find(Successor); - LifetimeLattice OldSuccEntryState = (SuccIt != BlockEntryStates.end()) - ? SuccIt->second - : LifetimeLattice{}; - LifetimeLattice NewSuccEntryState = - OldSuccEntryState.join(ExitState, LifetimeFact); - // Enqueue the successor if its entry state has changed. - // TODO(opt): Consider changing 'join' to report a change if != - // comparison is found expensive. - if (SuccIt == BlockEntryStates.end() || - NewSuccEntryState != OldSuccEntryState) { - BlockEntryStates[Successor] = NewSuccEntryState; - Worklist.enqueueBlock(Successor); - } - } - } + LoanSet SrcLoans = getLoans(In, SrcOID); + return LoanPropagationLattice( + Factory.OriginMapFactory.add(In.Origins, DestOID, SrcLoans)); } - void dump() const { - llvm::dbgs() << "==========================================\n"; - llvm::dbgs() << " Dataflow results:\n"; - llvm::dbgs() << "==========================================\n"; - const CFGBlock &B = Cfg.getExit(); - getExitState(&B).dump(llvm::dbgs()); - } - - LifetimeLattice getEntryState(const CFGBlock *B) const { - return BlockEntryStates.lookup(B); - } - - LifetimeLattice getExitState(const CFGBlock *B) const { - return BlockExitStates.lookup(B); +private: + LoanSet getLoans(Lattice L, OriginID OID) { + if (auto *Loans = L.Origins.lookup(OID)) + return *Loans; + return Factory.LoanSetFactory.getEmptySet(); } }; // ========================================================================= // -// TODO: Analysing dataflow results and error reporting. +// TODO: +// - Modifying loan propagation to answer `LoanSet getLoans(Origin O, Point P)` +// - Modify loan expiry analysis to answer `bool isExpired(Loan L, Point P)` +// - Modify origin liveness analysis to answer `bool isLive(Origin O, Point P)` +// - Using the above three to perform the final error reporting. // ========================================================================= // } // anonymous namespace @@ -755,8 +805,9 @@ void runLifetimeSafetyAnalysis(const DeclContext &DC, const CFG &Cfg, /// blocks; only Decls are visible. Therefore, loans in a block that /// never reach an Origin associated with a Decl can be safely dropped by /// the analysis. - LifetimeDataflow Dataflow(Cfg, FactMgr, AC); - Dataflow.run(); - DEBUG_WITH_TYPE("LifetimeDataflow", Dataflow.dump()); + LifetimeFactory Factory; + LoanPropagationAnalysis LoanPropagation(Cfg, AC, FactMgr, Factory); + LoanPropagation.run(); + DEBUG_WITH_TYPE("LifetimeLoanPropagation", LoanPropagation.dump()); } } // namespace clang diff --git a/clang/lib/Analysis/UninitializedValues.cpp b/clang/lib/Analysis/UninitializedValues.cpp index 8c9cf8dac79ed..f6b1c67ab20c3 100644 --- a/clang/lib/Analysis/UninitializedValues.cpp +++ b/clang/lib/Analysis/UninitializedValues.cpp @@ -276,13 +276,7 @@ namespace { /// escaped the analysis and will be treated as an initialization. 
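The UninitializedValues hunks below add a `ConstPtrUse` classification: passing the address of a variable to a callee is no longer unconditionally ignored. A sketch of the pattern this is meant to diagnose (illustrative only; the function names are invented):

  void readOnly(const int *P);

  void f() {
    int X;        // never initialized
    readOnly(&X); // previously classified as Ignore; now ConstPtrUse, so an
                  // always-uninitialized X is reported, analogous to the
                  // existing const-reference-use diagnostic
  }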
class ClassifyRefs : public StmtVisitor { public: - enum Class { - Init, - Use, - SelfInit, - ConstRefUse, - Ignore - }; + enum Class { Init, Use, SelfInit, ConstRefUse, ConstPtrUse, Ignore }; private: const DeclContext *DC; @@ -451,8 +445,7 @@ void ClassifyRefs::VisitCallExpr(CallExpr *CE) { const Expr *Ex = stripCasts(DC->getParentASTContext(), *I); const auto *UO = dyn_cast(Ex); if (UO && UO->getOpcode() == UO_AddrOf) - Ex = UO->getSubExpr(); - classify(Ex, Ignore); + classify(UO->getSubExpr(), isTrivialBody ? Ignore : ConstPtrUse); } } } @@ -496,6 +489,7 @@ class TransferFunctions : public StmtVisitor { void reportUse(const Expr *ex, const VarDecl *vd); void reportConstRefUse(const Expr *ex, const VarDecl *vd); + void reportConstPtrUse(const Expr *ex, const VarDecl *vd); void VisitBinaryOperator(BinaryOperator *bo); void VisitBlockExpr(BlockExpr *be); @@ -682,6 +676,15 @@ void TransferFunctions::reportConstRefUse(const Expr *ex, const VarDecl *vd) { } } +void TransferFunctions::reportConstPtrUse(const Expr *ex, const VarDecl *vd) { + Value v = vals[vd]; + if (isAlwaysUninit(v)) { + auto use = getUninitUse(ex, vd, v); + use.setConstPtrUse(); + handler.handleUseOfUninitVariable(vd, use); + } +} + void TransferFunctions::VisitObjCForCollectionStmt(ObjCForCollectionStmt *FS) { // This represents an initialization of the 'element' value. if (const auto *DS = dyn_cast(FS->getElement())) { @@ -754,6 +757,9 @@ void TransferFunctions::VisitDeclRefExpr(DeclRefExpr *dr) { case ClassifyRefs::ConstRefUse: reportConstRefUse(dr, cast(dr->getDecl())); break; + case ClassifyRefs::ConstPtrUse: + reportConstPtrUse(dr, cast(dr->getDecl())); + break; } } diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 6048169b56640..ac47b12cc8d72 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -453,22 +453,110 @@ static bool areEqualIntegers(const Expr *E1, const Expr *E2, ASTContext &Ctx) { } } +// Providing that `Ptr` is a pointer and `Size` is an unsigned-integral +// expression, returns true iff they follow one of the following safe +// patterns: +// 1. Ptr is `DRE.data()` and Size is `DRE.size()`, where DRE is a hardened +// container or view; +// +// 2. Ptr is `a` and Size is `n`, where `a` is of an array-of-T with constant +// size `n`; +// +// 3. Ptr is `&var` and Size is `1`; or +// Ptr is `std::addressof(...)` and Size is `1`; +// +// 4. 
Size is `0`; +static bool isPtrBufferSafe(const Expr *Ptr, const Expr *Size, + ASTContext &Ctx) { + // Pattern 1: + if (auto *MCEPtr = dyn_cast(Ptr->IgnoreParenImpCasts())) + if (auto *MCESize = + dyn_cast(Size->IgnoreParenImpCasts())) { + auto *DREOfPtr = dyn_cast( + MCEPtr->getImplicitObjectArgument()->IgnoreParenImpCasts()); + auto *DREOfSize = dyn_cast( + MCESize->getImplicitObjectArgument()->IgnoreParenImpCasts()); + + if (!DREOfPtr || !DREOfSize) + return false; // not in safe pattern + // We need to make sure 'a' is identical to 'b' for 'a.data()' and + // 'b.size()' otherwise we do not know they match: + if (DREOfPtr->getDecl() != DREOfSize->getDecl()) + return false; + if (MCEPtr->getMethodDecl()->getName() != "data") + return false; + // `MCEPtr->getRecordDecl()` must be non-null as `DREOfPtr` is non-null: + if (!MCEPtr->getRecordDecl()->isInStdNamespace()) + return false; + + auto *ObjII = MCEPtr->getRecordDecl()->getIdentifier(); + + if (!ObjII) + return false; + + bool AcceptSizeBytes = Ptr->getType()->getPointeeType()->isCharType(); + + if (!((AcceptSizeBytes && + MCESize->getMethodDecl()->getName() == "size_bytes") || + // Note here the pointer must be a pointer-to-char type unless there + // is explicit casting. If there is explicit casting, this branch + // is unreachable. Thus, at this branch "size" and "size_bytes" are + // equivalent as the pointer is a char pointer: + MCESize->getMethodDecl()->getName() == "size")) + return false; + + return llvm::is_contained({SIZED_CONTAINER_OR_VIEW_LIST}, + ObjII->getName()); + } + + Expr::EvalResult ER; + + // Pattern 2-4: + if (Size->EvaluateAsInt(ER, Ctx)) { + // Pattern 2: + if (auto *DRE = dyn_cast(Ptr->IgnoreParenImpCasts())) { + if (auto *CAT = Ctx.getAsConstantArrayType(DRE->getType())) { + llvm::APSInt SizeInt = ER.Val.getInt(); + + return llvm::APSInt::compareValues( + SizeInt, llvm::APSInt(CAT->getSize(), true)) == 0; + } + return false; + } + + // Pattern 3: + if (ER.Val.getInt().isOne()) { + if (auto *UO = dyn_cast(Ptr->IgnoreParenImpCasts())) + return UO && UO->getOpcode() == UnaryOperator::Opcode::UO_AddrOf; + if (auto *CE = dyn_cast(Ptr->IgnoreParenImpCasts())) { + auto *FnDecl = CE->getDirectCallee(); + + return FnDecl && FnDecl->getNameAsString() == "addressof" && + FnDecl->isInStdNamespace(); + } + return false; + } + // Pattern 4: + if (ER.Val.getInt().isZero()) + return true; + } + return false; +} + // Given a two-param std::span construct call, matches iff the call has the // following forms: // 1. `std::span{new T[n], n}`, where `n` is a literal or a DRE // 2. `std::span{new T, 1}` -// 3. `std::span{&var, 1}` or `std::span{std::addressof(...), 1}` -// 4. `std::span{a, n}`, where `a` is of an array-of-T with constant size -// `n` -// 5. `std::span{any, 0}` -// 6. `std::span{ (char *)f(args), args[N] * arg*[M]}`, where +// 3. `std::span{ (char *)f(args), args[N] * arg*[M]}`, where // `f` is a function with attribute `alloc_size(N, M)`; // `args` represents the list of arguments; // `N, M` are parameter indexes to the allocating element number and size. // Sometimes, there is only one parameter index representing the total // size. -// 7. `std::span{x.begin(), x.end()}` where `x` is an object in the +// 4. `std::span{x.begin(), x.end()}` where `x` is an object in the // SIZED_CONTAINER_OR_VIEW_LIST. +// 5. 
`isPtrBufferSafe` returns true for the two arguments of the span +// constructor static bool isSafeSpanTwoParamConstruct(const CXXConstructExpr &Node, ASTContext &Ctx) { assert(Node.getNumArgs() == 2 && @@ -495,7 +583,7 @@ static bool isSafeSpanTwoParamConstruct(const CXXConstructExpr &Node, // Check form 5: return true; - // Check forms 1-3: + // Check forms 1-2: switch (Arg0->getStmtClass()) { case Stmt::CXXNewExprClass: if (auto Size = cast(Arg0)->getArraySize()) { @@ -509,35 +597,11 @@ static bool isSafeSpanTwoParamConstruct(const CXXConstructExpr &Node, return Arg1CV && Arg1CV->isOne(); } break; - case Stmt::UnaryOperatorClass: - if (cast(Arg0)->getOpcode() == - UnaryOperator::Opcode::UO_AddrOf) - // Check form 3: - return Arg1CV && Arg1CV->isOne(); - break; - case Stmt::CallExprClass: - // Check form 3: - if (const auto *CE = dyn_cast(Arg0)) { - const auto FnDecl = CE->getDirectCallee(); - if (FnDecl && FnDecl->getNameAsString() == "addressof" && - FnDecl->isInStdNamespace()) { - return Arg1CV && Arg1CV->isOne(); - } - } - break; default: break; } - QualType Arg0Ty = Arg0->IgnoreImplicit()->getType(); - - if (auto *ConstArrTy = Ctx.getAsConstantArrayType(Arg0Ty)) { - const llvm::APSInt ConstArrSize = llvm::APSInt(ConstArrTy->getSize()); - - // Check form 4: - return Arg1CV && llvm::APSInt::compareValues(ConstArrSize, *Arg1CV) == 0; - } - // Check form 6: + // Check form 3: if (auto CCast = dyn_cast(Arg0)) { if (!CCast->getType()->isPointerType()) return false; @@ -566,7 +630,7 @@ static bool isSafeSpanTwoParamConstruct(const CXXConstructExpr &Node, } } } - // Check form 7: + // Check form 4: auto IsMethodCallToSizedObject = [](const Stmt *Node, StringRef MethodName) { if (const auto *MC = dyn_cast(Node)) { const auto *MD = MC->getMethodDecl(); @@ -592,7 +656,9 @@ static bool isSafeSpanTwoParamConstruct(const CXXConstructExpr &Node, cast(Arg1) ->getImplicitObjectArgument() ->IgnoreParenImpCasts()); - return false; + + // Check form 5: + return isPtrBufferSafe(Arg0, Arg1, Ctx); } static bool isSafeArraySubscript(const ArraySubscriptExpr &Node, @@ -1052,24 +1118,11 @@ static bool hasUnsafePrintfStringArg(const CallExpr &Node, ASTContext &Ctx, return false; } -// This matcher requires that it is known that the callee `isNormalPrintf`. -// Then it matches if the first two arguments of the call is a pointer and an -// integer and they are not in a safe pattern. -// -// For the first two arguments: `ptr` and `size`, they are safe if in the -// following patterns: -// -// Pattern 1: -// ptr := DRE.data(); -// size:= DRE.size()/DRE.size_bytes() -// And DRE is a hardened container or view. -// -// Pattern 2: -// ptr := Constant-Array-DRE; -// size:= any expression that has compile-time constant value equivalent to -// sizeof (Constant-Array-DRE) -static bool hasUnsafeSnprintfBuffer(const CallExpr &Node, - const ASTContext &Ctx) { +// This function requires that it is known that the callee `isNormalPrintf`. +// It returns true iff the first two arguments of the call are a pointer +// `Ptr` and an unsigned integer `Size` and they are NOT safe, i.e., +// `!isPtrBufferSafe(Ptr, Size)`.
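+// +// Illustrative examples of these patterns (editorial sketch, not part of the +// patch; whether a type counts as hardened depends on the contents of +// SIZED_CONTAINER_OR_VIEW_LIST): +//   std::array<char, 16> Buf; +//   snprintf(Buf.data(), Buf.size(), ...);  // safe: pattern 1, same object +//   char Raw[8]; +//   snprintf(Raw, 8, ...);                  // safe: pattern 2, size == extent +//   snprintf(&C, 1, ...);                   // safe: pattern 3, &var with 1 +//   snprintf(P, 0, ...);                    // safe: pattern 4, zero size +//   snprintf(P, N, ...);                    // unsafe: P and N are unrelated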
+static bool hasUnsafeSnprintfBuffer(const CallExpr &Node, ASTContext &Ctx) { const FunctionDecl *FD = Node.getDirectCallee(); assert(FD && "It should have been checked that FD is non-null."); @@ -1085,57 +1138,12 @@ static bool hasUnsafeSnprintfBuffer(const CallExpr &Node, QualType FirstPteTy = FirstParmTy->castAs()->getPointeeType(); const Expr *Buf = Node.getArg(0), *Size = Node.getArg(1); - if (FirstPteTy.isConstQualified() || !Buf->getType()->isPointerType() || - !Size->getType()->isIntegerType()) + if (FirstPteTy.isConstQualified() || !FirstPteTy->isAnyCharacterType() || + !Buf->getType()->isPointerType() || + !Size->getType()->isUnsignedIntegerType()) return false; // not an snprintf call - // Pattern 1: - static StringRef SizedObjs[] = {SIZED_CONTAINER_OR_VIEW_LIST}; - Buf = Buf->IgnoreParenImpCasts(); - Size = Size->IgnoreParenImpCasts(); - if (auto *MCEPtr = dyn_cast(Buf)) - if (auto *MCESize = dyn_cast(Size)) { - auto *DREOfPtr = dyn_cast( - MCEPtr->getImplicitObjectArgument()->IgnoreParenImpCasts()); - auto *DREOfSize = dyn_cast( - MCESize->getImplicitObjectArgument()->IgnoreParenImpCasts()); - - if (!DREOfPtr || !DREOfSize) - return true; // not in safe pattern - if (DREOfPtr->getDecl() != DREOfSize->getDecl()) - return true; // not in safe pattern - if (MCEPtr->getMethodDecl()->getName() != "data") - return true; // not in safe pattern - - if (MCESize->getMethodDecl()->getName() == "size_bytes" || - // Note here the pointer must be a pointer-to-char type unless there - // is explicit casting. If there is explicit casting, this branch - // is unreachable. Thus, at this branch "size" and "size_bytes" are - // equivalent as the pointer is a char pointer: - MCESize->getMethodDecl()->getName() == "size") - for (StringRef SizedObj : SizedObjs) - if (MCEPtr->getRecordDecl()->isInStdNamespace() && - MCEPtr->getRecordDecl()->getCanonicalDecl()->getName() == - SizedObj) - return false; // It is in fact safe - } - - // Pattern 2: - if (auto *DRE = dyn_cast(Buf->IgnoreParenImpCasts())) { - if (auto *CAT = Ctx.getAsConstantArrayType(DRE->getType())) { - Expr::EvalResult ER; - // The array element type must be compatible with `char` otherwise an - // explicit cast will be needed, which will make this check unreachable. - // Therefore, the array extent is same as its' bytewise size. - if (Size->EvaluateAsInt(ER, Ctx)) { - llvm::APSInt EVal = ER.Val.getInt(); // Size must have integer type - - return llvm::APSInt::compareValues( - EVal, llvm::APSInt(CAT->getSize(), true)) != 0; - } - } - } - return true; // ptr and size are not in safe pattern + return !isPtrBufferSafe(Buf, Size, Ctx); } } // namespace libc_func_matchers diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt index f8a31c890ac4d..adfc6ee326b5a 100644 --- a/clang/lib/Basic/CMakeLists.txt +++ b/clang/lib/Basic/CMakeLists.txt @@ -111,7 +111,6 @@ add_clang_library(clangBasic Targets/Mips.cpp Targets/NVPTX.cpp Targets/OSTargets.cpp - Targets/PNaCl.cpp Targets/PPC.cpp Targets/RISCV.cpp Targets/SPIR.cpp diff --git a/clang/lib/Basic/CodeGenOptions.cpp b/clang/lib/Basic/CodeGenOptions.cpp index 9a2aa9beb9e0d..db8a77cd93cf6 100644 --- a/clang/lib/Basic/CodeGenOptions.cpp +++ b/clang/lib/Basic/CodeGenOptions.cpp @@ -36,13 +36,13 @@ void CodeGenOptions::resetNonModularOptions(StringRef ModuleFormat) { // emitted into the PCM (-gmodules). 
if (ModuleFormat == "raw" && !DebugTypeExtRefs) { #define DEBUGOPT(Name, Bits, Default, Compatibility) \ - if constexpr (CK::Compatibility == CK::Affecting) \ + if constexpr (CK::Compatibility != CK::Benign) \ Name = Default; #define VALUE_DEBUGOPT(Name, Bits, Default, Compatibility) \ - if constexpr (CK::Compatibility == CK::Affecting) \ + if constexpr (CK::Compatibility != CK::Benign) \ Name = Default; #define ENUM_DEBUGOPT(Name, Type, Bits, Default, Compatibility) \ - if constexpr (CK::Compatibility == CK::Affecting) \ + if constexpr (CK::Compatibility != CK::Benign) \ set##Name(Default); #include "clang/Basic/DebugOptions.def" } diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index d3d393bd09396..220b31b0f19bc 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -378,7 +378,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, return #Name; #include "clang/Basic/OpenMPKinds.def" } - llvm_unreachable("Invalid OpenMP 'schedule' clause type"); + llvm_unreachable("Invalid OpenMP 'defaultmap' clause type"); case OMPC_atomic_default_mem_order: switch (Type) { case OMPC_ATOMIC_DEFAULT_MEM_ORDER_unknown: diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index af1111a863308..5c2af9b080b83 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -29,7 +29,6 @@ #include "Targets/Mips.h" #include "Targets/NVPTX.h" #include "Targets/OSTargets.h" -#include "Targets/PNaCl.h" #include "Targets/PPC.h" #include "Targets/RISCV.h" #include "Targets/SPIR.h" @@ -228,8 +227,6 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, return std::make_unique>(Triple, Opts); case llvm::Triple::Haiku: return std::make_unique>(Triple, Opts); - case llvm::Triple::NaCl: - return std::make_unique>(Triple, Opts); case llvm::Triple::Win32: switch (Triple.getEnvironment()) { case llvm::Triple::Cygnus: @@ -260,8 +257,6 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, return std::make_unique>(Triple, Opts); case llvm::Triple::RTEMS: return std::make_unique>(Triple, Opts); - case llvm::Triple::NaCl: - return std::make_unique>(Triple, Opts); default: return std::make_unique(Triple, Opts); } @@ -304,9 +299,6 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, return std::make_unique>(Triple, Opts); case llvm::Triple::NetBSD: return std::make_unique>(Triple, Opts); - case llvm::Triple::NaCl: - return std::make_unique>(Triple, - Opts); case llvm::Triple::Win32: switch (Triple.getEnvironment()) { case llvm::Triple::GNU: @@ -591,8 +583,6 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, return std::make_unique(Triple, Opts); case llvm::Triple::RTEMS: return std::make_unique(Triple, Opts); - case llvm::Triple::NaCl: - return std::make_unique>(Triple, Opts); case llvm::Triple::ELFIAMCU: return std::make_unique(Triple, Opts); case llvm::Triple::Hurd: @@ -652,8 +642,6 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, } case llvm::Triple::Haiku: return std::make_unique>(Triple, Opts); - case llvm::Triple::NaCl: - return std::make_unique>(Triple, Opts); case llvm::Triple::PS4: return std::make_unique>(Triple, Opts); case llvm::Triple::PS5: diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 65d4ed1e96540..7ff8e51f8a7f8 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -57,9 +57,6 @@ void ARMTargetInfo::setABIAAPCS() { "-a:0:32" "-n32" "-S64"); - } else if (T.isOSNaCl()) { - 
assert(!BigEndian && "NaCl on ARM does not support big endian"); - resetDataLayout("e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S128"); } else { resetDataLayout(BigEndian ? "E-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 30d861a7ca605..42cff6540c5e3 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -865,53 +865,6 @@ class LLVM_LIBRARY_VISIBILITY WindowsTargetInfo : public OSTargetInfo { } }; -template -class LLVM_LIBRARY_VISIBILITY NaClTargetInfo : public OSTargetInfo { -protected: - void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple, - MacroBuilder &Builder) const override { - if (Opts.POSIXThreads) - Builder.defineMacro("_REENTRANT"); - if (Opts.CPlusPlus) - Builder.defineMacro("_GNU_SOURCE"); - - DefineStd(Builder, "unix", Opts); - Builder.defineMacro("__native_client__"); - } - -public: - NaClTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) - : OSTargetInfo(Triple, Opts) { - this->LongAlign = 32; - this->LongWidth = 32; - this->PointerAlign = 32; - this->PointerWidth = 32; - this->IntMaxType = TargetInfo::SignedLongLong; - this->Int64Type = TargetInfo::SignedLongLong; - this->DoubleAlign = 64; - this->LongDoubleWidth = 64; - this->LongDoubleAlign = 64; - this->LongLongWidth = 64; - this->LongLongAlign = 64; - this->SizeType = TargetInfo::UnsignedInt; - this->PtrDiffType = TargetInfo::SignedInt; - this->IntPtrType = TargetInfo::SignedInt; - // RegParmMax is inherited from the underlying architecture. - this->LongDoubleFormat = &llvm::APFloat::IEEEdouble(); - if (Triple.getArch() == llvm::Triple::arm) { - // Handled in ARM's setABI(). - } else if (Triple.getArch() == llvm::Triple::x86) { - this->resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-" - "i64:64-i128:128-n8:16:32-S128"); - } else if (Triple.getArch() == llvm::Triple::x86_64) { - this->resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-" - "i64:64-i128:128-n8:16:32:64-S128"); - } else if (Triple.getArch() == llvm::Triple::mipsel) { - // Handled on mips' setDataLayout. - } - } -}; - // Fuchsia Target template class LLVM_LIBRARY_VISIBILITY FuchsiaTargetInfo : public OSTargetInfo { diff --git a/clang/lib/Basic/Targets/PNaCl.cpp b/clang/lib/Basic/Targets/PNaCl.cpp deleted file mode 100644 index c4adfbefb9c73..0000000000000 --- a/clang/lib/Basic/Targets/PNaCl.cpp +++ /dev/null @@ -1,29 +0,0 @@ -//===--- PNaCl.cpp - Implement PNaCl target feature support ---------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements PNaCl TargetInfo objects. 
-// -//===----------------------------------------------------------------------===// - -#include "PNaCl.h" -#include "clang/Basic/MacroBuilder.h" - -using namespace clang; -using namespace clang::targets; - -ArrayRef PNaClTargetInfo::getGCCRegNames() const { return {}; } - -ArrayRef PNaClTargetInfo::getGCCRegAliases() const { - return {}; -} - -void PNaClTargetInfo::getArchDefines(const LangOptions &Opts, - MacroBuilder &Builder) const { - Builder.defineMacro("__le32__"); - Builder.defineMacro("__pnacl__"); -} diff --git a/clang/lib/Basic/Targets/PNaCl.h b/clang/lib/Basic/Targets/PNaCl.h deleted file mode 100644 index d162776b5a0d6..0000000000000 --- a/clang/lib/Basic/Targets/PNaCl.h +++ /dev/null @@ -1,90 +0,0 @@ -//===--- PNaCl.h - Declare PNaCl target feature support ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file declares PNaCl TargetInfo objects. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_PNACL_H -#define LLVM_CLANG_LIB_BASIC_TARGETS_PNACL_H - -#include "Mips.h" -#include "clang/Basic/TargetInfo.h" -#include "clang/Basic/TargetOptions.h" -#include "llvm/Support/Compiler.h" -#include "llvm/TargetParser/Triple.h" - -namespace clang { -namespace targets { - -class LLVM_LIBRARY_VISIBILITY PNaClTargetInfo : public TargetInfo { -public: - PNaClTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) - : TargetInfo(Triple) { - this->LongAlign = 32; - this->LongWidth = 32; - this->PointerAlign = 32; - this->PointerWidth = 32; - this->IntMaxType = TargetInfo::SignedLongLong; - this->Int64Type = TargetInfo::SignedLongLong; - this->DoubleAlign = 64; - this->LongDoubleWidth = 64; - this->LongDoubleAlign = 64; - this->SizeType = TargetInfo::UnsignedInt; - this->PtrDiffType = TargetInfo::SignedInt; - this->IntPtrType = TargetInfo::SignedInt; - this->RegParmMax = 0; // Disallow regparm - } - - void getArchDefines(const LangOptions &Opts, MacroBuilder &Builder) const; - - void getTargetDefines(const LangOptions &Opts, - MacroBuilder &Builder) const override { - getArchDefines(Opts, Builder); - } - - bool hasFeature(StringRef Feature) const override { - return Feature == "pnacl"; - } - - llvm::SmallVector getTargetBuiltins() const override { - return {}; - } - - BuiltinVaListKind getBuiltinVaListKind() const override { - return TargetInfo::PNaClABIBuiltinVaList; - } - - ArrayRef getGCCRegNames() const override; - - ArrayRef getGCCRegAliases() const override; - - bool validateAsmConstraint(const char *&Name, - TargetInfo::ConstraintInfo &Info) const override { - return false; - } - - std::string_view getClobbers() const override { return ""; } - - bool hasBitIntType() const override { return true; } -}; - -// We attempt to use PNaCl (le32) frontend and Mips32EL backend. 
-class LLVM_LIBRARY_VISIBILITY NaClMips32TargetInfo : public MipsTargetInfo { -public: - NaClMips32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) - : MipsTargetInfo(Triple, Opts) {} - - BuiltinVaListKind getBuiltinVaListKind() const override { - return TargetInfo::PNaClABIBuiltinVaList; - } -}; -} // namespace targets -} // namespace clang - -#endif // LLVM_CLANG_LIB_BASIC_TARGETS_PNACL_H diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 1abf798d93129..c13b286cd7916 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -264,6 +264,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR32TargetInfo : public SPIRTargetInfo { PointerWidth = PointerAlign = 32; SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; + // SPIR32 has support for atomic ops if atomic extension is enabled. + // Take the maximum because it's possible the Host supports wider types. + MaxAtomicInlineWidth = std::max(MaxAtomicInlineWidth, 32); resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } @@ -281,6 +284,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR64TargetInfo : public SPIRTargetInfo { PointerWidth = PointerAlign = 64; SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; + // SPIR64 has support for atomic ops if atomic extension is enabled. + // Take the maximum because it's possible the Host supports wider types. + MaxAtomicInlineWidth = std::max(MaxAtomicInlineWidth, 64); resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 5bd53ebc52ab5..f855bdad2d7c3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -348,22 +348,6 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { return CIRBaseBuilderTy::createStore(loc, val, dst.getPointer(), align); } - mlir::Value createComplexCreate(mlir::Location loc, mlir::Value real, - mlir::Value imag) { - auto resultComplexTy = cir::ComplexType::get(real.getType()); - return create(loc, resultComplexTy, real, imag); - } - - mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) { - auto operandTy = mlir::cast(operand.getType()); - return create(loc, operandTy.getElementType(), operand); - } - - mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) { - auto operandTy = mlir::cast(operand.getType()); - return create(loc, operandTy.getElementType(), operand); - } - /// Create a cir.complex.real_ptr operation that derives a pointer to the real /// part of the complex value pointed to by the specified pointer value. mlir::Value createComplexRealPtr(mlir::Location loc, mlir::Value value) { diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 72e8d71c366d8..61d1c54ee9ec9 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -57,6 +57,20 @@ static RValue emitBuiltinBitOp(CIRGenFunction &cgf, const CallExpr *e, return RValue::get(result); } +RValue CIRGenFunction::emitRotate(const CallExpr *e, bool isRotateLeft) { + mlir::Value input = emitScalarExpr(e->getArg(0)); + mlir::Value amount = emitScalarExpr(e->getArg(1)); + + // TODO(cir): MSVC flavor bit rotate builtins use different types for input + // and amount, but cir.rotate requires them to have the same type. 
Cast amount + // to the type of input when necessary. + assert(!cir::MissingFeatures::msvcBuiltins()); + + auto r = builder.create(getLoc(e->getSourceRange()), input, + amount, isRotateLeft); + return RValue::get(r); +} + RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, const CallExpr *e, ReturnValueSlot returnValue) { @@ -111,7 +125,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, mlir::Value real = emitScalarExpr(e->getArg(0)); mlir::Value imag = emitScalarExpr(e->getArg(1)); mlir::Value complex = builder.createComplexCreate(loc, real, imag); - return RValue::get(complex); + return RValue::getComplex(complex); } case Builtin::BI__builtin_creal: @@ -136,6 +150,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return RValue::get(imag); } + case Builtin::BI__builtin_conj: + case Builtin::BI__builtin_conjf: + case Builtin::BI__builtin_conjl: + case Builtin::BIconj: + case Builtin::BIconjf: + case Builtin::BIconjl: { + mlir::Value complex = emitComplexExpr(e->getArg(0)); + mlir::Value conj = builder.createUnaryOp(getLoc(e->getExprLoc()), + cir::UnaryOpKind::Not, complex); + return RValue::getComplex(conj); + } + case Builtin::BI__builtin_clrsb: case Builtin::BI__builtin_clrsbl: case Builtin::BI__builtin_clrsbll: @@ -219,6 +245,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, mlir::Value arg = emitScalarExpr(e->getArg(0)); return RValue::get(builder.create(loc, arg)); } + + case Builtin::BI__builtin_rotateleft8: + case Builtin::BI__builtin_rotateleft16: + case Builtin::BI__builtin_rotateleft32: + case Builtin::BI__builtin_rotateleft64: + return emitRotate(e, /*isRotateLeft=*/true); + + case Builtin::BI__builtin_rotateright8: + case Builtin::BI__builtin_rotateright16: + case Builtin::BI__builtin_rotateright32: + case Builtin::BI__builtin_rotateright64: + return emitRotate(e, /*isRotateLeft=*/false); } // If this is an alias for a lib function (e.g. 
__builtin_sin), emit diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index cc4a615dc392e..8667bb60d114e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -75,7 +75,7 @@ static void emitMemberInitializer(CIRGenFunction &cgf, const CXXConstructorDecl *constructor, FunctionArgList &args) { assert(memberInit->isAnyMemberInitializer() && - "Mush have member initializer!"); + "Must have member initializer!"); assert(memberInit->getInit() && "Must have initializer!"); assert(!cir::MissingFeatures::generateDebugInfo()); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 3273d9000771a..0a22771378ff1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -56,7 +56,28 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitParenExpr(ParenExpr *e); mlir::Value VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *e); + + mlir::Value VisitPrePostIncDec(const UnaryOperator *e, bool isInc, + bool isPre); + + mlir::Value VisitUnaryPostDec(const UnaryOperator *e) { + return VisitPrePostIncDec(e, false, false); + } + + mlir::Value VisitUnaryPostInc(const UnaryOperator *e) { + return VisitPrePostIncDec(e, true, false); + } + + mlir::Value VisitUnaryPreDec(const UnaryOperator *e) { + return VisitPrePostIncDec(e, false, true); + } + + mlir::Value VisitUnaryPreInc(const UnaryOperator *e) { + return VisitPrePostIncDec(e, true, true); + } + mlir::Value VisitUnaryDeref(const Expr *e); + mlir::Value VisitUnaryNot(const UnaryOperator *e); struct BinOpInfo { mlir::Location loc; @@ -230,8 +251,7 @@ mlir::Value ComplexExprEmitter::VisitBinComma(const BinaryOperator *e) { mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) { if (e->getCallReturnType(cgf.getContext())->isReferenceType()) return emitLoadOfLValue(e); - - return cgf.emitCallExpr(e).getValue(); + return cgf.emitCallExpr(e).getComplexValue(); } mlir::Value ComplexExprEmitter::VisitCastExpr(CastExpr *e) { @@ -334,10 +354,21 @@ mlir::Value ComplexExprEmitter::VisitSubstNonTypeTemplateParmExpr( return Visit(e->getReplacement()); } +mlir::Value ComplexExprEmitter::VisitPrePostIncDec(const UnaryOperator *e, + bool isInc, bool isPre) { + LValue lv = cgf.emitLValue(e->getSubExpr()); + return cgf.emitComplexPrePostIncDec(e, lv, isInc, isPre); +} + mlir::Value ComplexExprEmitter::VisitUnaryDeref(const Expr *e) { return emitLoadOfLValue(e); } +mlir::Value ComplexExprEmitter::VisitUnaryNot(const UnaryOperator *e) { + mlir::Value op = Visit(e->getSubExpr()); + return builder.createNot(op); +} + mlir::Value ComplexExprEmitter::emitPromoted(const Expr *e, QualType promotionTy) { e = e->IgnoreParens(); @@ -417,6 +448,29 @@ mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) { return ComplexExprEmitter(*this).Visit(const_cast(e)); } +mlir::Value CIRGenFunction::emitComplexPrePostIncDec(const UnaryOperator *e, + LValue lv, bool isInc, + bool isPre) { + mlir::Value inVal = emitLoadOfComplex(lv, e->getExprLoc()); + mlir::Location loc = getLoc(e->getExprLoc()); + auto opKind = isInc ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; + mlir::Value incVal = builder.createUnaryOp(loc, opKind, inVal); + + // Store the updated result through the lvalue. 
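+ // For example, for a complex value (1.0, 2.0), both ++z and z++ store + // (2.0, 2.0) here (only the real part is incremented); the final select then + // yields the updated value for the pre-forms and the original (1.0, 2.0) for + // the post-forms.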
+ emitStoreOfComplex(loc, incVal, lv, /*isInit=*/false); + + if (getLangOpts().OpenMP) + cgm.errorNYI(loc, "emitComplexPrePostIncDec OpenMP"); + + // If this is a postinc, return the value read from memory, otherwise use the + // updated value. + return isPre ? incVal : inVal; +} + +mlir::Value CIRGenFunction::emitLoadOfComplex(LValue src, SourceLocation loc) { + return ComplexExprEmitter(*this).emitLoadOfLValue(src, loc); +} + void CIRGenFunction::emitStoreOfComplex(mlir::Location loc, mlir::Value v, LValue dest, bool isInit) { ComplexExprEmitter(*this).emitStoreOfComplex(loc, v, dest, isInit); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 1346333739bc1..9541f4f0725eb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -930,6 +930,9 @@ class CIRGenFunction : public CIRGenTypeCache { /// returning the result. mlir::Value emitComplexExpr(const Expr *e); + mlir::Value emitComplexPrePostIncDec(const UnaryOperator *e, LValue lv, + bool isInc, bool isPre); + LValue emitComplexAssignmentLValue(const BinaryOperator *e); void emitCompoundStmt(const clang::CompoundStmt &s); @@ -980,6 +983,9 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitLoadOfBitfieldLValue(LValue lv, SourceLocation loc); + /// Load a complex number from the specified l-value. + mlir::Value emitLoadOfComplex(LValue src, SourceLocation loc); + /// Given an expression that represents a value lvalue, this method emits /// the address of the lvalue, then loads the result as an rvalue, /// returning the rvalue. @@ -1030,6 +1036,8 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitReturnStmt(const clang::ReturnStmt &s); + RValue emitRotate(const CallExpr *e, bool isRotateLeft); + mlir::Value emitScalarConstant(const ConstantEmission &constant, Expr *e); /// Emit a conversion from the specified type to the specified destination diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index 0a6dba5e80a62..0832c4141a10f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -58,6 +58,12 @@ class RValue { return value; } + /// Return the value of this complex value. + mlir::Value getComplexValue() const { + assert(isComplex() && "Not a complex!"); + return value; + } + /// Return the value of the address of the aggregate. Address getAggregateAddress() const { assert(isAggregate() && "Not an aggregate!"); diff --git a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt index 4dece5b57e450..18beca7b9a680 100644 --- a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt +++ b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt @@ -3,6 +3,7 @@ add_clang_library(MLIRCIRTransforms CIRSimplify.cpp FlattenCFG.cpp HoistAllocas.cpp + LoweringPrepare.cpp DEPENDS MLIRCIRPassIncGen diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp new file mode 100644 index 0000000000000..8f848c7345610 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -0,0 +1,95 @@ +//===- LoweringPrepare.cpp - preparation work for LLVM lowering ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PassDetail.h" +#include "clang/AST/ASTContext.h" +#include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIROpsEnums.h" +#include "clang/CIR/Dialect/Passes.h" + +#include + +using namespace mlir; +using namespace cir; + +namespace { +struct LoweringPreparePass : public LoweringPrepareBase { + LoweringPreparePass() = default; + void runOnOperation() override; + + void runOnOp(mlir::Operation *op); + void lowerUnaryOp(cir::UnaryOp op); +}; + +} // namespace + +void LoweringPreparePass::lowerUnaryOp(cir::UnaryOp op) { + mlir::Type ty = op.getType(); + if (!mlir::isa(ty)) + return; + + mlir::Location loc = op.getLoc(); + cir::UnaryOpKind opKind = op.getKind(); + + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op); + + mlir::Value operand = op.getInput(); + mlir::Value operandReal = builder.createComplexReal(loc, operand); + mlir::Value operandImag = builder.createComplexImag(loc, operand); + + mlir::Value resultReal; + mlir::Value resultImag; + + switch (opKind) { + case cir::UnaryOpKind::Inc: + case cir::UnaryOpKind::Dec: + resultReal = builder.createUnaryOp(loc, opKind, operandReal); + resultImag = operandImag; + break; + + case cir::UnaryOpKind::Plus: + case cir::UnaryOpKind::Minus: + llvm_unreachable("Complex unary Plus/Minus NYI"); + break; + + case cir::UnaryOpKind::Not: + resultReal = operandReal; + resultImag = + builder.createUnaryOp(loc, cir::UnaryOpKind::Minus, operandImag); + break; + } + + mlir::Value result = builder.createComplexCreate(loc, resultReal, resultImag); + op.replaceAllUsesWith(result); + op.erase(); +} + +void LoweringPreparePass::runOnOp(mlir::Operation *op) { + if (auto unary = dyn_cast(op)) + lowerUnaryOp(unary); +} + +void LoweringPreparePass::runOnOperation() { + mlir::Operation *op = getOperation(); + + llvm::SmallVector opsToTransform; + + op->walk([&](mlir::Operation *op) { + if (mlir::isa(op)) + opsToTransform.push_back(op); + }); + + for (mlir::Operation *o : opsToTransform) + runOnOp(o); +} + +std::unique_ptr mlir::createLoweringPreparePass() { + return std::make_unique(); +} diff --git a/clang/lib/CIR/Lowering/CIRPasses.cpp b/clang/lib/CIR/Lowering/CIRPasses.cpp index 7a581939580a9..5607abc98e319 100644 --- a/clang/lib/CIR/Lowering/CIRPasses.cpp +++ b/clang/lib/CIR/Lowering/CIRPasses.cpp @@ -31,6 +31,8 @@ mlir::LogicalResult runCIRToCIRPasses(mlir::ModuleOp theModule, if (enableCIRSimplify) pm.addPass(mlir::createCIRSimplifyPass()); + pm.addPass(mlir::createLoweringPreparePass()); + pm.enableVerifier(enableVerifier); (void)mlir::applyPassManagerCLOptions(pm); return pm.run(theModule); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 7dcea0c8eb529..840e856ba0cf8 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -872,6 +872,21 @@ mlir::LogicalResult CIRToLLVMReturnOpLowering::matchAndRewrite( return mlir::LogicalResult::success(); } +mlir::LogicalResult CIRToLLVMRotateOpLowering::matchAndRewrite( + cir::RotateOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + // Note that LLVM intrinsic calls to @llvm.fsh{r,l}.i* have the same type as + // the operand. 
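+ // A rotate is a funnel shift with both data operands equal: + // rotl(x, n) == fshl(x, x, n) and rotr(x, n) == fshr(x, x, n), which is why + // `input` is passed twice below.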
+ mlir::Value input = adaptor.getInput(); + if (op.isRotateLeft()) + rewriter.replaceOpWithNewOp(op, input, input, + adaptor.getAmount()); + else + rewriter.replaceOpWithNewOp(op, input, input, + adaptor.getAmount()); + return mlir::LogicalResult::success(); +} + static mlir::LogicalResult rewriteCallOrInvoke(mlir::Operation *op, mlir::ValueRange callOperands, mlir::ConversionPatternRewriter &rewriter, @@ -2077,6 +2092,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMGetBitfieldOpLowering, CIRToLLVMGetGlobalOpLowering, CIRToLLVMGetMemberOpLowering, + CIRToLLVMRotateOpLowering, CIRToLLVMSelectOpLowering, CIRToLLVMSetBitfieldOpLowering, CIRToLLVMShiftOpLowering, diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index 3c30b1bc5b072..3faf1e900848e 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -160,6 +160,16 @@ class CIRToLLVMReturnOpLowering mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMRotateOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::RotateOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMCallOpLowering : public mlir::OpConversionPattern { public: using mlir::OpConversionPattern::OpConversionPattern; diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 2f6d4c414e737..1b7257857dd3b 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -407,13 +407,13 @@ static bool initTargetOptions(const CompilerInstance &CI, // Set EABI version. Options.EABIVersion = TargetOpts.EABIVersion; - if (LangOpts.hasSjLjExceptions()) + if (CodeGenOpts.hasSjLjExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::SjLj; - if (LangOpts.hasSEHExceptions()) + if (CodeGenOpts.hasSEHExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::WinEH; - if (LangOpts.hasDWARFExceptions()) + if (CodeGenOpts.hasDWARFExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::DwarfCFI; - if (LangOpts.hasWasmExceptions()) + if (CodeGenOpts.hasWasmExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::Wasm; Options.NoInfsFPMath = LangOpts.NoHonorInfs; diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index f3ddf7bf9a463..0e80522536e15 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -853,9 +853,24 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) { offset += size; index++; }; + auto addSignedHeaderField = + [&](llvm::Value *Value, const PointerAuthSchema &Schema, + GlobalDecl Decl, QualType Type, CharUnits Size, const Twine &Name) { + auto StorageAddress = projectField(index, Name); + if (Schema) { + auto AuthInfo = EmitPointerAuthInfo( + Schema, StorageAddress.emitRawPointer(*this), Decl, Type); + Value = EmitPointerAuthSign(AuthInfo, Value); + } + Builder.CreateStore(Value, StorageAddress); + offset += Size; + index++; + }; if (!IsOpenCL) { - addHeaderField(isa, getPointerSize(), "block.isa"); + addSignedHeaderField( + isa, CGM.getCodeGenOpts().PointerAuth.ObjCIsaPointers, GlobalDecl(), + QualType(), getPointerSize(), "block.isa"); addHeaderField(llvm::ConstantInt::get(IntTy, flags.getBitMask()), getIntSize(), "block.flags"); addHeaderField(llvm::ConstantInt::get(IntTy, 0), getIntSize(), @@ -1285,7 +1300,9 @@ 
static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, if (IsWindows) fields.addNullPointer(CGM.Int8PtrPtrTy); else - fields.add(CGM.getNSConcreteGlobalBlock()); + fields.addSignedPointer(CGM.getNSConcreteGlobalBlock(), + CGM.getCodeGenOpts().PointerAuth.ObjCIsaPointers, + GlobalDecl(), QualType()); // __flags BlockFlags flags = BLOCK_IS_GLOBAL; @@ -1532,8 +1549,8 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction( llvm::BasicBlock *resume = Builder.GetInsertBlock(); // Go back to the entry. - if (entry_ptr->getNextNonDebugInstruction()) - entry_ptr = entry_ptr->getNextNonDebugInstruction()->getIterator(); + if (entry_ptr->getNextNode()) + entry_ptr = entry_ptr->getNextNode()->getIterator(); else entry_ptr = entry->end(); Builder.SetInsertPoint(entry, entry_ptr); diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 0fc488e98aaf0..117ef3d16e21b 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -707,11 +707,15 @@ struct GetReturnObjectManager { Builder.CreateStore(Builder.getFalse(), GroActiveFlag); GroEmission = CGF.EmitAutoVarAlloca(*GroVarDecl); - auto *GroAlloca = dyn_cast_or_null( - GroEmission.getOriginalAllocatedAddress().getPointer()); - assert(GroAlloca && "expected alloca to be emitted"); - GroAlloca->setMetadata(llvm::LLVMContext::MD_coro_outside_frame, - llvm::MDNode::get(CGF.CGM.getLLVMContext(), {})); + + if (!GroVarDecl->isNRVOVariable()) { + // NRVO variables don't have allocas and won't have the same issue. + auto *GroAlloca = dyn_cast_or_null( + GroEmission.getOriginalAllocatedAddress().getPointer()); + assert(GroAlloca && "expected alloca to be emitted"); + GroAlloca->setMetadata(llvm::LLVMContext::MD_coro_outside_frame, + llvm::MDNode::get(CGF.CGM.getLLVMContext(), {})); + } // Remember the top of EHStack before emitting the cleanup. auto old_top = CGF.EHStack.stable_begin(); diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index f97c7b6445984..446cf8d9e05c6 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -170,6 +170,10 @@ void CGDebugInfo::addInstToSpecificSourceAtom(llvm::Instruction *KeyInstruction, if (!Group || !CGM.getCodeGenOpts().DebugKeyInstructions) return; + llvm::DISubprogram *SP = KeyInstruction->getFunction()->getSubprogram(); + if (!SP || !SP->getKeyInstructionsEnabled()) + return; + addInstSourceAtomMetadata(KeyInstruction, Group, /*Rank=*/1); llvm::Instruction *BackupI = @@ -799,7 +803,8 @@ void CGDebugInfo::CreateCompileUnit() { // Create new compile unit. TheCU = DBuilder.createCompileUnit( LangTag, CUFile, CGOpts.EmitVersionIdentMetadata ? 
Producer : "", - LO.Optimize || CGOpts.PrepareForLTO || CGOpts.PrepareForThinLTO, + CGOpts.OptimizationLevel != 0 || CGOpts.PrepareForLTO || + CGOpts.PrepareForThinLTO, CGOpts.DwarfDebugFlags, RuntimeVers, CGOpts.SplitDwarfFile, EmissionKind, DwoId, CGOpts.SplitDwarfInlining, CGOpts.DebugInfoForProfiling, NameTableKind, CGOpts.DebugRangesBaseAddress, remapDIPath(Sysroot), SDK); @@ -2284,7 +2289,7 @@ llvm::DISubprogram *CGDebugInfo::CreateCXXMemberFunction( Flags |= llvm::DINode::FlagRValueReference; if (!Method->isExternallyVisible()) SPFlags |= llvm::DISubprogram::SPFlagLocalToUnit; - if (CGM.getLangOpts().Optimize) + if (CGM.getCodeGenOpts().OptimizationLevel != 0) SPFlags |= llvm::DISubprogram::SPFlagOptimized; // In this debug mode, emit type info for a class when its constructor type @@ -4355,7 +4360,7 @@ llvm::DISubprogram *CGDebugInfo::getFunctionFwdDeclOrStub(GlobalDecl GD, FD->getReturnType(), ArgTypes, FunctionProtoType::ExtProtoInfo(CC)); if (!FD->isExternallyVisible()) SPFlags |= llvm::DISubprogram::SPFlagLocalToUnit; - if (CGM.getLangOpts().Optimize) + if (CGM.getCodeGenOpts().OptimizationLevel != 0) SPFlags |= llvm::DISubprogram::SPFlagOptimized; if (Stub) { @@ -4685,7 +4690,7 @@ void CGDebugInfo::emitFunctionStart(GlobalDecl GD, SourceLocation Loc, if (Fn->hasLocalLinkage()) SPFlags |= llvm::DISubprogram::SPFlagLocalToUnit; - if (CGM.getLangOpts().Optimize) + if (CGM.getCodeGenOpts().OptimizationLevel != 0) SPFlags |= llvm::DISubprogram::SPFlagOptimized; llvm::DINode::DIFlags FlagsForDef = Flags | getCallSiteRelatedAttrs(); @@ -4768,7 +4773,7 @@ void CGDebugInfo::EmitFunctionDecl(GlobalDecl GD, SourceLocation Loc, unsigned LineNo = getLineNumber(Loc); unsigned ScopeLine = 0; llvm::DISubprogram::DISPFlags SPFlags = llvm::DISubprogram::SPFlagZero; - if (CGM.getLangOpts().Optimize) + if (CGM.getCodeGenOpts().OptimizationLevel != 0) SPFlags |= llvm::DISubprogram::SPFlagOptimized; llvm::DINodeArray Annotations = CollectBTFDeclTagAnnotations(D); @@ -5100,7 +5105,8 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const VarDecl *VD, // Use VarDecl's Tag, Scope and Line number. auto FieldAlign = getDeclAlignIfRequired(Field, CGM.getContext()); auto *D = DBuilder.createAutoVariable( - Scope, FieldName, Unit, Line, FieldTy, CGM.getLangOpts().Optimize, + Scope, FieldName, Unit, Line, FieldTy, + CGM.getCodeGenOpts().OptimizationLevel != 0, Flags | llvm::DINode::FlagArtificial, FieldAlign); // Insert an llvm.dbg.declare into the current block. @@ -5126,9 +5132,9 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const VarDecl *VD, llvm::DILocalVariable *D = nullptr; if (ArgNo) { llvm::DINodeArray Annotations = CollectBTFDeclTagAnnotations(VD); - D = DBuilder.createParameterVariable(Scope, Name, *ArgNo, Unit, Line, Ty, - CGM.getLangOpts().Optimize, Flags, - Annotations); + D = DBuilder.createParameterVariable( + Scope, Name, *ArgNo, Unit, Line, Ty, + CGM.getCodeGenOpts().OptimizationLevel != 0, Flags, Annotations); } else { // For normal local variable, we will try to find out whether 'VD' is the // copy parameter of coroutine. @@ -5169,8 +5175,9 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const VarDecl *VD, D = RemapCoroArgToLocalVar(); // Or we will create a new DIVariable for this Decl if D dose not exists. 
if (!D) - D = DBuilder.createAutoVariable(Scope, Name, Unit, Line, Ty, - CGM.getLangOpts().Optimize, Flags, Align); + D = DBuilder.createAutoVariable( + Scope, Name, Unit, Line, Ty, + CGM.getCodeGenOpts().OptimizationLevel != 0, Flags, Align); } // Insert an llvm.dbg.declare into the current block. DBuilder.insertDeclare(Storage, D, DBuilder.createExpression(Expr), @@ -5224,7 +5231,7 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const BindingDecl *BD, auto *Scope = cast<llvm::DIScope>(LexicalBlockStack.back()); // Create the descriptor for the variable. llvm::DILocalVariable *D = DBuilder.createAutoVariable( - Scope, Name, Unit, Line, Ty, CGM.getLangOpts().Optimize, + Scope, Name, Unit, Line, Ty, CGM.getCodeGenOpts().OptimizationLevel != 0, llvm::DINode::FlagZero, Align); if (const MemberExpr *ME = dyn_cast<MemberExpr>(BD->getBinding())) { @@ -5322,9 +5329,10 @@ void CGDebugInfo::EmitLabel(const LabelDecl *D, CGBuilderTy &Builder) { StringRef Name = D->getName(); // Create the descriptor for the label. - auto *L = DBuilder.createLabel( - Scope, Name, Unit, Line, Column, /*IsArtificial=*/false, - /*CoroSuspendIdx=*/std::nullopt, CGM.getLangOpts().Optimize); + auto *L = DBuilder.createLabel(Scope, Name, Unit, Line, Column, + /*IsArtificial=*/false, + /*CoroSuspendIdx=*/std::nullopt, + CGM.getCodeGenOpts().OptimizationLevel != 0); // Insert an llvm.dbg.label into the current block. DBuilder.insertLabel(L, @@ -5587,7 +5595,8 @@ void CGDebugInfo::EmitDeclareOfBlockLiteralArgVariable(const CGBlockInfo &block, // Create the descriptor for the parameter. auto *debugVar = DBuilder.createParameterVariable( - scope, Name, ArgNo, tunit, line, type, CGM.getLangOpts().Optimize, flags); + scope, Name, ArgNo, tunit, line, type, + CGM.getCodeGenOpts().OptimizationLevel != 0, flags); // Insert an llvm.dbg.declare into the current block. DBuilder.insertDeclare(Alloca, debugVar, DBuilder.createExpression(), @@ -6362,7 +6371,7 @@ llvm::DebugLoc CGDebugInfo::SourceLocToDebugLoc(SourceLocation Loc) { llvm::DINode::DIFlags CGDebugInfo::getCallSiteRelatedAttrs() const { // Call site-related attributes are only useful in optimized programs, and // when there's a possibility of debugging backtraces.
- if (!CGM.getLangOpts().Optimize || + if (CGM.getCodeGenOpts().OptimizationLevel == 0 || DebugKind == llvm::codegenoptions::NoDebugInfo || DebugKind == llvm::codegenoptions::LocTrackingOnly) return llvm::DINode::FlagZero; @@ -6471,24 +6480,27 @@ SanitizerOrdinalToCheckLabel(SanitizerKind::SanitizerOrdinal Ordinal) { llvm::DILocation *CodeGenFunction::SanitizerAnnotateDebugInfo( ArrayRef Ordinals, SanitizerHandler Handler) { + llvm::DILocation *CheckDebugLoc = Builder.getCurrentDebugLocation(); + auto *DI = getDebugInfo(); + if (!DI) + return CheckDebugLoc; + std::string Label; if (Ordinals.size() == 1) Label = SanitizerOrdinalToCheckLabel(Ordinals[0]); else Label = SanitizerHandlerToCheckLabel(Handler); - llvm::DILocation *CheckDI = Builder.getCurrentDebugLocation(); - for (auto Ord : Ordinals) { // TODO: deprecate ClArrayBoundsPseudoFn if (((ClArrayBoundsPseudoFn && Ord == SanitizerKind::SO_ArrayBounds) || CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo.has(Ord)) && - CheckDI) { - return getDebugInfo()->CreateSyntheticInlineAt(CheckDI, Label); + CheckDebugLoc) { + return DI->CreateSyntheticInlineAt(CheckDebugLoc, Label); } } - return CheckDI; + return CheckDebugLoc; } SanitizerDebugLocation::SanitizerDebugLocation( diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index ad138b9876e8c..f86af4581c345 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -131,20 +131,21 @@ const EHPersonality EHPersonality::ZOS_CPlusPlus = {"__zos_cxx_personality_v2", nullptr}; static const EHPersonality &getCPersonality(const TargetInfo &Target, - const LangOptions &L) { + const CodeGenOptions &CGOpts) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_C_SJLJ; - if (L.hasDWARFExceptions()) + if (CGOpts.hasDWARFExceptions()) return EHPersonality::GNU_C; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_C_SEH; return EHPersonality::GNU_C; } static const EHPersonality &getObjCPersonality(const TargetInfo &Target, + const CodeGenOptions &CGOpts, const LangOptions &L) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) @@ -152,7 +153,7 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, switch (L.ObjCRuntime.getKind()) { case ObjCRuntime::FragileMacOSX: - return getCPersonality(Target, L); + return getCPersonality(Target, CGOpts); case ObjCRuntime::MacOSX: case ObjCRuntime::iOS: case ObjCRuntime::WatchOS: @@ -165,9 +166,9 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, [[fallthrough]]; case ObjCRuntime::GCC: case ObjCRuntime::ObjFW: - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_ObjC_SJLJ; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_ObjC_SEH; return EHPersonality::GNU_ObjC; } @@ -175,19 +176,19 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, } static const EHPersonality &getCXXPersonality(const TargetInfo &Target, - const LangOptions &L) { + const CodeGenOptions &CGOpts) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; if (T.isOSAIX()) return EHPersonality::XL_CPlusPlus; - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_CPlusPlus_SJLJ; - if 
(L.hasDWARFExceptions()) + if (CGOpts.hasDWARFExceptions()) return EHPersonality::GNU_CPlusPlus; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_CPlusPlus_SEH; - if (L.hasWasmExceptions()) + if (CGOpts.hasWasmExceptions()) return EHPersonality::GNU_Wasm_CPlusPlus; if (T.isOSzOS()) return EHPersonality::ZOS_CPlusPlus; @@ -197,6 +198,7 @@ static const EHPersonality &getCXXPersonality(const TargetInfo &Target, /// Determines the personality function to use when both C++ /// and Objective-C exceptions are being caught. static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, + const CodeGenOptions &CGOpts, const LangOptions &L) { if (Target.getTriple().isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; @@ -205,7 +207,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, // In the fragile ABI, just use C++ exception handling and hope // they're not doing crazy exception mixing. case ObjCRuntime::FragileMacOSX: - return getCXXPersonality(Target, L); + return getCXXPersonality(Target, CGOpts); // The ObjC personality defers to the C++ personality for non-ObjC // handlers. Unlike the C++ case, we use the same personality @@ -213,7 +215,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, case ObjCRuntime::MacOSX: case ObjCRuntime::iOS: case ObjCRuntime::WatchOS: - return getObjCPersonality(Target, L); + return getObjCPersonality(Target, CGOpts, L); case ObjCRuntime::GNUstep: return Target.getTriple().isOSCygMing() ? EHPersonality::GNU_CPlusPlus_SEH @@ -223,7 +225,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, // mixed EH. Use the ObjC personality just to avoid returning null. case ObjCRuntime::GCC: case ObjCRuntime::ObjFW: - return getObjCPersonality(Target, L); + return getObjCPersonality(Target, CGOpts, L); } llvm_unreachable("bad runtime kind"); } @@ -237,6 +239,7 @@ static const EHPersonality &getSEHPersonalityMSVC(const llvm::Triple &T) { const EHPersonality &EHPersonality::get(CodeGenModule &CGM, const FunctionDecl *FD) { const llvm::Triple &T = CGM.getTarget().getTriple(); + const CodeGenOptions &CGOpts = CGM.getCodeGenOpts(); const LangOptions &L = CGM.getLangOpts(); const TargetInfo &Target = CGM.getTarget(); @@ -245,10 +248,10 @@ const EHPersonality &EHPersonality::get(CodeGenModule &CGM, return getSEHPersonalityMSVC(T); if (L.ObjC) - return L.CPlusPlus ? getObjCXXPersonality(Target, L) - : getObjCPersonality(Target, L); - return L.CPlusPlus ? getCXXPersonality(Target, L) - : getCPersonality(Target, L); + return L.CPlusPlus ? getObjCXXPersonality(Target, CGOpts, L) + : getObjCPersonality(Target, CGOpts, L); + return L.CPlusPlus ? getCXXPersonality(Target, CGOpts) + : getCPersonality(Target, CGOpts); } const EHPersonality &EHPersonality::get(CodeGenFunction &CGF) { @@ -344,7 +347,7 @@ void CodeGenModule::SimplifyPersonality() { return; const EHPersonality &ObjCXX = EHPersonality::get(*this, /*FD=*/nullptr); - const EHPersonality &CXX = getCXXPersonality(getTarget(), LangOpts); + const EHPersonality &CXX = getCXXPersonality(getTarget(), CodeGenOpts); if (&ObjCXX == &CXX) return; @@ -500,7 +503,7 @@ void CodeGenFunction::EmitStartEHSpec(const Decl *D) { // In Wasm EH we currently treat 'throw()' in the same way as 'noexcept'. In // case of throw with types, we ignore it and print a warning for now. 
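// For reference (assumption from existing driver options, not changed here):
// the accessors this patch moves onto CodeGenOptions map to the selectable
// exception models roughly as follows:
//   -fsjlj-exceptions  -> hasSjLjExceptions()  -> *_SJLJ personalities
//   -fdwarf-exceptions -> hasDWARFExceptions() -> DWARF/Itanium personalities
//   -fseh-exceptions   -> hasSEHExceptions()   -> *_SEH personalities
//   -fwasm-exceptions  -> hasWasmExceptions()  -> GNU_Wasm_CPlusPlus
// Passing CodeGenOptions through reflects that the unwinding model is a
// code-generation choice rather than a language-dialect one.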
// TODO Correctly handle exception specification in Wasm EH - if (CGM.getLangOpts().hasWasmExceptions()) { + if (CGM.getCodeGenOpts().hasWasmExceptions()) { if (EST == EST_DynamicNone) EHStack.pushTerminate(); else @@ -515,8 +518,8 @@ void CodeGenFunction::EmitStartEHSpec(const Decl *D) { // throw with types. // TODO Correctly handle exception specification in Emscripten EH if (getTarget().getCXXABI() == TargetCXXABI::WebAssembly && - CGM.getLangOpts().getExceptionHandling() == - LangOptions::ExceptionHandlingKind::None && + CGM.getCodeGenOpts().getExceptionHandling() == + CodeGenOptions::ExceptionHandlingKind::None && EST == EST_Dynamic) CGM.getDiags().Report(D->getLocation(), diag::warn_wasm_dynamic_exception_spec_ignored) @@ -604,7 +607,7 @@ void CodeGenFunction::EmitEndEHSpec(const Decl *D) { // In wasm we currently treat 'throw()' in the same way as 'noexcept'. In // case of throw with types, we ignore it and print a warning for now. // TODO Correctly handle exception specification in wasm - if (CGM.getLangOpts().hasWasmExceptions()) { + if (CGM.getCodeGenOpts().hasWasmExceptions()) { if (EST == EST_DynamicNone) EHStack.popTerminate(); return; diff --git a/clang/lib/CodeGen/CGLoopInfo.cpp b/clang/lib/CodeGen/CGLoopInfo.cpp index 4a9092842858b..b2b569a43038c 100644 --- a/clang/lib/CodeGen/CGLoopInfo.cpp +++ b/clang/lib/CodeGen/CGLoopInfo.cpp @@ -221,18 +221,6 @@ LoopInfo::createLoopVectorizeMetadata(const LoopAttributes &Attrs, return createUnrollAndJamMetadata(Attrs, LoopProperties, HasUserTransforms); } - // Apply all loop properties to the vectorized loop. - SmallVector<Metadata *, 4> FollowupLoopProperties; - FollowupLoopProperties.append(LoopProperties.begin(), LoopProperties.end()); - - // Don't vectorize an already vectorized loop. - FollowupLoopProperties.push_back( - MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.isvectorized"))); - - bool FollowupHasTransforms = false; - SmallVector<Metadata *, 4> Followup = createUnrollAndJamMetadata( - Attrs, FollowupLoopProperties, FollowupHasTransforms); - SmallVector<Metadata *, 4> Args; Args.append(LoopProperties.begin(), LoopProperties.end()); @@ -286,22 +274,46 @@ LoopInfo::createLoopVectorizeMetadata(const LoopAttributes &Attrs, // 5) it is implied when vectorize.width is unset (0) and the user // explicitly requested fixed-width vectorization, i.e. // vectorize.scalable.enable is false. + bool VectorizeEnabled = false; if (Attrs.VectorizeEnable != LoopAttributes::Unspecified || (IsVectorPredicateEnabled && Attrs.VectorizeWidth != 1) || Attrs.VectorizeWidth > 1 || Attrs.VectorizeScalable == LoopAttributes::Enable || (Attrs.VectorizeScalable == LoopAttributes::Disable && Attrs.VectorizeWidth != 1)) { - bool AttrVal = Attrs.VectorizeEnable != LoopAttributes::Disable; + VectorizeEnabled = Attrs.VectorizeEnable != LoopAttributes::Disable; Args.push_back( MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), ConstantAsMetadata::get(ConstantInt::get( - llvm::Type::getInt1Ty(Ctx), AttrVal))})); + llvm::Type::getInt1Ty(Ctx), VectorizeEnabled))})); } - if (FollowupHasTransforms) - Args.push_back( - createFollowupMetadata("llvm.loop.vectorize.followup_all", Followup)); + // Apply all loop properties to the vectorized loop. + SmallVector<Metadata *, 4> FollowupLoopProperties; + + // If vectorization is not explicitly enabled, the follow-up metadata will be + // directly appended to the list currently being created. In that case, adding + // LoopProperties to FollowupLoopProperties would result in duplication.
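// Rough shape of the loop metadata produced by this path when vectorization
// is explicitly enabled (illustrative IR, node numbering arbitrary):
//   br ..., !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !2 = !{!"llvm.loop.vectorize.followup_all", !3, ...}
//   !3 = !{!"llvm.loop.isvectorized"}
// When vectorization is not explicitly enabled, the followup contents are
// spliced into !0 itself instead of being nested under followup_all, which
// is why LoopProperties must not be duplicated in that case (see the code
// that follows).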
+ if (VectorizeEnabled) + FollowupLoopProperties.append(LoopProperties.begin(), LoopProperties.end()); + + // Don't vectorize an already vectorized loop. + FollowupLoopProperties.push_back( + MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.isvectorized"))); + + bool FollowupHasTransforms = false; + SmallVector<Metadata *, 4> Followup = createUnrollAndJamMetadata( + Attrs, FollowupLoopProperties, FollowupHasTransforms); + + if (FollowupHasTransforms) { + // If vectorization is explicitly enabled, we create a follow-up metadata, + // otherwise directly add the contents of it to Args. + if (VectorizeEnabled) + Args.push_back( + createFollowupMetadata("llvm.loop.vectorize.followup_all", Followup)); + else + Args.append(Followup.begin(), Followup.end()); + } HasUserTransforms = true; return Args; diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp index 6f87444d3f672..24b6ce7c1c70d 100644 --- a/clang/lib/CodeGen/CGObjC.cpp +++ b/clang/lib/CodeGen/CGObjC.cpp @@ -1193,16 +1193,23 @@ CodeGenFunction::generateObjCGetterBody(const ObjCImplementationDecl *classImpl, ivarAddr = ivarAddr.withElementType(bitcastType); llvm::LoadInst *load = Builder.CreateLoad(ivarAddr, "load"); load->setAtomic(llvm::AtomicOrdering::Unordered); + llvm::Value *ivarVal = load; + if (PointerAuthQualifier PAQ = ivar->getType().getPointerAuth()) { + CGPointerAuthInfo SrcInfo = EmitPointerAuthInfo(PAQ, ivarAddr); + CGPointerAuthInfo TargetInfo = + CGM.getPointerAuthInfoForType(getterMethod->getReturnType()); + ivarVal = emitPointerAuthResign(ivarVal, ivar->getType(), SrcInfo, + TargetInfo, /*isKnownNonNull=*/false); + } // Store that value into the return address. Doing this with a // bitcast is likely to produce some pretty ugly IR, but it's not // the *most* terrible thing in the world. llvm::Type *retTy = ConvertType(getterMethod->getReturnType()); uint64_t retTySize = CGM.getDataLayout().getTypeSizeInBits(retTy); - llvm::Value *ivarVal = load; if (ivarSize > retTySize) { bitcastType = llvm::Type::getIntNTy(getLLVMContext(), retTySize); - ivarVal = Builder.CreateTrunc(load, bitcastType); + ivarVal = Builder.CreateTrunc(ivarVal, bitcastType); } Builder.CreateStore(ivarVal, ReturnValue.withElementType(bitcastType)); @@ -1214,6 +1221,16 @@ CodeGenFunction::generateObjCGetterBody(const ObjCImplementationDecl *classImpl, case PropertyImplStrategy::GetSetProperty: { llvm::FunctionCallee getPropertyFn = CGM.getObjCRuntime().GetPropertyGetFunction(); + + if (ivar->getType().getPointerAuth()) { + // This currently cannot be hit, but if we ever allow objc pointers + // to be signed, this will become possible. Reaching here would require + // a copy, weak, etc property backed by an authenticated pointer.
+ CGM.ErrorUnsupported(propImpl, + "Obj-C getter requiring pointer authentication"); + return; + } + if (!getPropertyFn) { CGM.ErrorUnsupported(propImpl, "Obj-C getter requiring atomic copy"); return; @@ -1269,7 +1286,9 @@ CodeGenFunction::generateObjCGetterBody(const ObjCImplementationDecl *classImpl, LValue LV = EmitLValueForIvar(TypeOfSelfObject(), LoadObjCSelf(), ivar, 0); QualType ivarType = ivar->getType(); - switch (getEvaluationKind(ivarType)) { + auto EvaluationKind = getEvaluationKind(ivarType); + assert(!ivarType.getPointerAuth() || EvaluationKind == TEK_Scalar); + switch (EvaluationKind) { case TEK_Complex: { ComplexPairTy pair = EmitLoadOfComplex(LV, SourceLocation()); EmitStoreOfComplex(pair, MakeAddrLValue(ReturnValue, ivarType), @@ -1287,6 +1306,11 @@ CodeGenFunction::generateObjCGetterBody(const ObjCImplementationDecl *classImpl, case TEK_Scalar: { llvm::Value *value; if (propType->isReferenceType()) { + if (ivarType.getPointerAuth()) { + CGM.ErrorUnsupported(propImpl, + "Obj-C getter for authenticated reference type"); + return; + } value = LV.getAddress().emitRawPointer(*this); } else { // We want to load and autoreleaseReturnValue ARC __weak ivars. @@ -1300,7 +1324,19 @@ CodeGenFunction::generateObjCGetterBody(const ObjCImplementationDecl *classImpl, // Otherwise we want to do a simple load, suppressing the // final autorelease. } else { - value = EmitLoadOfLValue(LV, SourceLocation()).getScalarVal(); + if (PointerAuthQualifier PAQ = ivar->getType().getPointerAuth()) { + Address ivarAddr = LV.getAddress(); + llvm::LoadInst *LoadInst = Builder.CreateLoad(ivarAddr, "load"); + llvm::Value *Load = LoadInst; + auto SrcInfo = EmitPointerAuthInfo(PAQ, ivarAddr); + auto TargetInfo = + CGM.getPointerAuthInfoForType(getterMethod->getReturnType()); + Load = emitPointerAuthResign(Load, ivarType, SrcInfo, TargetInfo, + /*isKnownNonNull=*/false); + value = Load; + } else + value = EmitLoadOfLValue(LV, SourceLocation()).getScalarVal(); + AutoreleaseResult = false; } @@ -1490,6 +1526,14 @@ CodeGenFunction::generateObjCSetterBody(const ObjCImplementationDecl *classImpl, llvm::Value *load = Builder.CreateLoad(argAddr); + if (PointerAuthQualifier PAQ = ivar->getType().getPointerAuth()) { + QualType PropertyType = propImpl->getPropertyDecl()->getType(); + CGPointerAuthInfo SrcInfo = CGM.getPointerAuthInfoForType(PropertyType); + CGPointerAuthInfo TargetInfo = EmitPointerAuthInfo(PAQ, ivarAddr); + load = emitPointerAuthResign(load, ivar->getType(), SrcInfo, TargetInfo, + /*isKnownNonNull=*/false); + } + // Perform an atomic store. There are no memory ordering requirements. 
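// Summary sketch of the resign direction in the getter/setter paths touched
// above (helpers as used in this file; condensed for illustration):
//   getter: Src = ivar schema   (EmitPointerAuthInfo(PAQ, ivarAddr))
//           Dst = return type   (CGM.getPointerAuthInfoForType(retTy))
//   setter: Src = property type (CGM.getPointerAuthInfoForType(propTy))
//           Dst = ivar schema   (EmitPointerAuthInfo(PAQ, ivarAddr))
// i.e. the ivar is always stored signed under its own schema, and values are
// resigned at the property boundary just before the unordered atomic access.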
llvm::StoreInst *store = Builder.CreateStore(load, ivarAddr); store->setAtomic(llvm::AtomicOrdering::Unordered); diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index d828702cbb87f..8acf8d2ddec02 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1942,8 +1942,9 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { // struct objc_class *sibling_class classFields.addNullPointer(PtrTy); // struct objc_protocol_list *protocols; - auto RuntimeProtocols = GetRuntimeProtocolList(classDecl->protocol_begin(), - classDecl->protocol_end()); + auto RuntimeProtocols = + GetRuntimeProtocolList(classDecl->all_referenced_protocol_begin(), + classDecl->all_referenced_protocol_end()); SmallVector Protocols; for (const auto *I : RuntimeProtocols) Protocols.push_back(GenerateProtocolRef(I)); diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index a52c92cdbc83b..8e71a576552d3 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -1935,7 +1935,9 @@ CGObjCCommonMac::GenerateConstantNSString(const StringLiteral *Literal) { auto Fields = Builder.beginStruct(NSConstantStringType); // Class pointer. - Fields.add(Class); + Fields.addSignedPointer(Class, + CGM.getCodeGenOpts().PointerAuth.ObjCIsaPointers, + GlobalDecl(), QualType()); // String pointer. llvm::Constant *C = @@ -4975,10 +4977,7 @@ enum ImageInfoFlags { eImageInfo_GCOnly = (1 << 2), eImageInfo_OptimizedByDyld = (1 << 3), // This flag is set by the dyld shared cache. - // A flag indicating that the module has no instances of a @synthesize of a - // superclass variable. This flag used to be consumed by the runtime to work - // around miscompile by gcc. - eImageInfo_CorrectedSynthesize = (1 << 4), // This flag is no longer set by clang. + eImageInfo_SignedClassRO = (1 << 4), // Reused (was _CorrectedSynthesize) eImageInfo_ImageIsSimulated = (1 << 5), eImageInfo_ClassProperties = (1 << 6) }; @@ -5036,6 +5035,17 @@ void CGObjCCommonMac::EmitImageInfo() { // Indicate whether we are generating class properties. Mod.addModuleFlag(llvm::Module::Error, "Objective-C Class Properties", eImageInfo_ClassProperties); + + // Indicate whether we want enforcement of pointer signing for class_ro_t + // pointers. + if (CGM.getLangOpts().PointerAuthObjcClassROPointers) + Mod.addModuleFlag(llvm::Module::Error, + "Objective-C Enforce ClassRO Pointer Signing", + eImageInfo_SignedClassRO); + else + Mod.addModuleFlag(llvm::Module::Error, + "Objective-C Enforce ClassRO Pointer Signing", + llvm::ConstantInt::get(Int8Ty, 0)); } // struct objc_module { @@ -6223,11 +6233,19 @@ llvm::GlobalVariable *CGObjCNonFragileABIMac::BuildClassRoTInitializer( methods.push_back(MD); } - values.add(emitMethodList(ID->getObjCRuntimeNameAsString(), - (flags & NonFragileABI_Class_Meta) - ? MethodListType::ClassMethods - : MethodListType::InstanceMethods, - methods)); + llvm::Constant *MethListPtr = emitMethodList( + ID->getObjCRuntimeNameAsString(), + (flags & NonFragileABI_Class_Meta) ? 
MethodListType::ClassMethods + : MethodListType::InstanceMethods, + methods); + + const PointerAuthSchema &MethListSchema = + CGM.getCodeGenOpts().PointerAuth.ObjCMethodListPointer; + if (!MethListPtr->isNullValue()) + values.addSignedPointer(MethListPtr, MethListSchema, GlobalDecl(), + QualType()); + else + values.add(MethListPtr); const ObjCInterfaceDecl *OID = ID->getClassInterface(); assert(OID && "CGObjCNonFragileABIMac::BuildClassRoTInitializer"); @@ -6275,15 +6293,20 @@ llvm::GlobalVariable *CGObjCNonFragileABIMac::BuildClassObject( bool HiddenVisibility) { ConstantInitBuilder builder(CGM); auto values = builder.beginStruct(ObjCTypes.ClassnfABITy); - values.add(IsAGV); - if (SuperClassGV) { - values.add(SuperClassGV); - } else { + const PointerAuthOptions &PointerAuthOpts = CGM.getCodeGenOpts().PointerAuth; + values.addSignedPointer(IsAGV, PointerAuthOpts.ObjCIsaPointers, GlobalDecl(), + QualType()); + if (SuperClassGV) + values.addSignedPointer(SuperClassGV, PointerAuthOpts.ObjCSuperPointers, + GlobalDecl(), QualType()); + else values.addNullPointer(ObjCTypes.ClassnfABIPtrTy); - } + values.add(ObjCEmptyCacheVar); values.add(ObjCEmptyVtableVar); - values.add(ClassRoGV); + + values.addSignedPointer(ClassRoGV, PointerAuthOpts.ObjCClassROPointers, + GlobalDecl(), QualType()); llvm::GlobalVariable *GV = cast( GetClassGlobal(CI, isMetaclass, ForDefinition)); @@ -6543,15 +6566,27 @@ void CGObjCNonFragileABIMac::GenerateCategory(const ObjCCategoryImplDecl *OCD) { } } - auto instanceMethodList = emitMethodList( + llvm::Constant *InstanceMethodList = emitMethodList( listName, MethodListType::CategoryInstanceMethods, instanceMethods); - auto classMethodList = emitMethodList( + const PointerAuthSchema &MethListSchema = + CGM.getCodeGenOpts().PointerAuth.ObjCMethodListPointer; + if (!InstanceMethodList->isNullValue()) + values.addSignedPointer(InstanceMethodList, MethListSchema, GlobalDecl(), + QualType()); + else + values.add(InstanceMethodList); + + llvm::Constant *ClassMethodList = emitMethodList( listName, MethodListType::CategoryClassMethods, classMethods); - values.add(instanceMethodList); - values.add(classMethodList); + if (!ClassMethodList->isNullValue()) + values.addSignedPointer(ClassMethodList, MethListSchema, GlobalDecl(), + QualType()); + else + values.add(ClassMethodList); + // Keep track of whether we have actual metadata to emit. 
bool isEmptyCategory = - instanceMethodList->isNullValue() && classMethodList->isNullValue(); + InstanceMethodList->isNullValue() && ClassMethodList->isNullValue(); const ObjCCategoryDecl *Category = Interface->FindCategoryDeclaration(OCD->getIdentifier()); @@ -6629,7 +6664,13 @@ void CGObjCNonFragileABIMac::emitMethodConstant(ConstantArrayBuilder &builder, } else { llvm::Function *fn = GetMethodDefinition(MD); assert(fn && "no definition for method?"); - method.add(fn); + if (const PointerAuthSchema &Schema = + CGM.getCodeGenOpts().PointerAuth.ObjCMethodListFunctionPointers) { + llvm::Constant *Bitcast = + llvm::ConstantExpr::getBitCast(fn, ObjCTypes.Int8PtrProgramASTy); + method.addSignedPointer(Bitcast, Schema, GlobalDecl(), QualType()); + } else + method.add(fn); } method.finishAndAddTo(builder); @@ -7672,10 +7713,15 @@ CGObjCNonFragileABIMac::GetInterfaceEHType(const ObjCInterfaceDecl *ID, } llvm::Value *VTableIdx = llvm::ConstantInt::get(CGM.Int32Ty, 2); + llvm::Constant *VTablePtr = llvm::ConstantExpr::getInBoundsGetElementPtr( + VTableGV->getValueType(), VTableGV, VTableIdx); + ConstantInitBuilder builder(CGM); auto values = builder.beginStruct(ObjCTypes.EHTypeTy); - values.add(llvm::ConstantExpr::getInBoundsGetElementPtr( - VTableGV->getValueType(), VTableGV, VTableIdx)); + const PointerAuthSchema &TypeInfoSchema = + CGM.getCodeGenOpts().PointerAuth.CXXTypeInfoVTablePointer; + values.addSignedPointer(VTablePtr, TypeInfoSchema, GlobalDecl(), QualType()); + values.add(GetClassName(ClassName)); values.add(GetClassGlobal(ID, /*metaclass*/ false, NotForDefinition)); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index a5f2f0efa2c3b..ce2dd4d76368a 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1092,7 +1092,7 @@ emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty, auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, Name, &CGM.getModule()); CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, FnInfo); - if (CGM.getLangOpts().Optimize) { + if (CGM.getCodeGenOpts().OptimizationLevel != 0) { Fn->removeFnAttr(llvm::Attribute::NoInline); Fn->removeFnAttr(llvm::Attribute::OptimizeNone); Fn->addFnAttr(llvm::Attribute::AlwaysInline); @@ -3199,7 +3199,7 @@ emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc, &CGM.getModule()); CGM.SetInternalFunctionAttributes(GlobalDecl(), TaskPrivatesMap, TaskPrivatesMapFnInfo); - if (CGM.getLangOpts().Optimize) { + if (CGM.getCodeGenOpts().OptimizationLevel != 0) { TaskPrivatesMap->removeFnAttr(llvm::Attribute::NoInline); TaskPrivatesMap->removeFnAttr(llvm::Attribute::OptimizeNone); TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline); diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index a05b31f971e18..0f2a352886e7f 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -144,7 +144,6 @@ add_clang_library(clangCodeGen Targets/MSP430.cpp Targets/Mips.cpp Targets/NVPTX.cpp - Targets/PNaCl.cpp Targets/PPC.cpp Targets/RISCV.cpp Targets/SPIR.cpp diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 5493cc92bd8b0..eb5b6045510df 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -995,7 +995,7 @@ CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { std::vector> Consumers(2); Consumers[0] = std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), - 
CI.getFrontendOpts().ModuleOutputPath); + CI.getFrontendOpts().ModuleOutputPath, CI.getCodeGenOpts()); Consumers[1] = std::move(Result); return std::make_unique<MultiplexConsumer>(std::move(Consumers)); } diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 776a646ceb32f..0fda31c8e5fa1 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -161,8 +161,7 @@ void CodeGenFunction::CGFPOptionsRAII::ConstructorHelper(FPOptions FPFeatures) { llvm::RoundingMode NewRoundingBehavior = FPFeatures.getRoundingMode(); CGF.Builder.setDefaultConstrainedRounding(NewRoundingBehavior); auto NewExceptionBehavior = - ToConstrainedExceptMD(static_cast<LangOptions::FPExceptionModeKind>( - FPFeatures.getExceptionMode())); + ToConstrainedExceptMD(FPFeatures.getExceptionMode()); CGF.Builder.setDefaultConstrainedExcept(NewExceptionBehavior); CGF.SetFastMathFlags(FPFeatures); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index c8866f15745c2..236cc3d9e8907 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -118,9 +118,7 @@ createTargetCodeGenInfo(CodeGenModule &CGM) { return createM68kTargetCodeGenInfo(CGM); case llvm::Triple::mips: case llvm::Triple::mipsel: - if (Triple.getOS() == llvm::Triple::NaCl) - return createPNaClTargetCodeGenInfo(CGM); - else if (Triple.getOS() == llvm::Triple::Win32) + if (Triple.getOS() == llvm::Triple::Win32) return createWindowsMIPSTargetCodeGenInfo(CGM, /*IsOS32=*/true); return createMIPSTargetCodeGenInfo(CGM, /*IsOS32=*/true); @@ -6616,7 +6614,9 @@ CodeGenModule::GetAddrOfConstantCFString(const StringLiteral *Literal) { auto Fields = Builder.beginStruct(STy); // Class pointer. - Fields.add(cast<llvm::GlobalValue>(CFConstantStringClassRef)); + Fields.addSignedPointer(cast<llvm::GlobalValue>(CFConstantStringClassRef), + getCodeGenOpts().PointerAuth.ObjCIsaPointers, + GlobalDecl(), QualType()); // Flags.
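// The same signing pattern recurs for every metadata "isa" in this patch
// (global blocks, NSString/CFString literals, class objects): the constant
// is emitted through ConstantInitBuilder with the ObjCIsaPointers schema and
// no decl/type discriminator, e.g.
//   Fields.addSignedPointer(ClassPtr,
//                           CGM.getCodeGenOpts().PointerAuth.ObjCIsaPointers,
//                           GlobalDecl(), QualType());
// Assumption: when the schema is unset this falls back to a plain, unsigned
// emission, so non-ptrauth targets are unaffected.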
if (IsSwiftABI) { diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index bd44874eac470..a7d796ecccc61 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -411,6 +411,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_rcp: case AMDGPU::BI__builtin_amdgcn_rcpf: case AMDGPU::BI__builtin_amdgcn_rcph: + case AMDGPU::BI__builtin_amdgcn_rcp_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rcp); case AMDGPU::BI__builtin_amdgcn_sqrt: case AMDGPU::BI__builtin_amdgcn_sqrtf: @@ -420,6 +421,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_rsq: case AMDGPU::BI__builtin_amdgcn_rsqf: case AMDGPU::BI__builtin_amdgcn_rsqh: + case AMDGPU::BI__builtin_amdgcn_rsq_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq); case AMDGPU::BI__builtin_amdgcn_rsq_clamp: case AMDGPU::BI__builtin_amdgcn_rsq_clampf: @@ -427,15 +429,19 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Intrinsic::amdgcn_rsq_clamp); case AMDGPU::BI__builtin_amdgcn_sinf: case AMDGPU::BI__builtin_amdgcn_sinh: + case AMDGPU::BI__builtin_amdgcn_sin_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin); case AMDGPU::BI__builtin_amdgcn_cosf: case AMDGPU::BI__builtin_amdgcn_cosh: + case AMDGPU::BI__builtin_amdgcn_cos_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos); case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: return EmitAMDGPUDispatchPtr(*this, E); case AMDGPU::BI__builtin_amdgcn_logf: + case AMDGPU::BI__builtin_amdgcn_log_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log); case AMDGPU::BI__builtin_amdgcn_exp2f: + case AMDGPU::BI__builtin_amdgcn_exp2_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_exp2); case AMDGPU::BI__builtin_amdgcn_log_clampf: @@ -497,6 +503,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); return Builder.CreateCall(F, { Src }); } + case AMDGPU::BI__builtin_amdgcn_tanhf: + case AMDGPU::BI__builtin_amdgcn_tanhh: case AMDGPU::BI__builtin_amdgcn_tanh_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_tanh); @@ -821,7 +829,46 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: - case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: { + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: + // GFX1250 WMMA builtins + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8: + case 
AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: { // These operations perform a matrix multiplication and accumulation of // the form: @@ -836,6 +883,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, // "false". bool AppendFalseForOpselArg = false; unsigned BuiltinWMMAOp; + // Need return type when D and C are of different types. 
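// Clarifying note on the mapping below (derived from the switch itself):
// each ArgsForMatchingMatrixTypes entry is an operand index whose LLVM type
// instantiates the overloaded intrinsic, e.g. {5, 1} selects the accumulator
// (operand 5) and the A-matrix (operand 1). NeedReturnType, set only for
// wmma_bf16f32_16x16x32_bf16 here, additionally prepends the call's result
// type to the overload list, since there D and C have different element types.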
+ bool NeedReturnType = false; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: @@ -974,6 +1023,160 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; break; + // GFX1250 WMMA builtins + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16: + ArgsForMatchingMatrixTypes = {5, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16: + NeedReturnType = true; + ArgsForMatchingMatrixTypes = {1, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; 
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8: + ArgsForMatchingMatrixTypes = {3, 0}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8: + ArgsForMatchingMatrixTypes = {4, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4: + ArgsForMatchingMatrixTypes = {3, 0, 1}; + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8; + break; } SmallVector Args; @@ -983,6 +1186,8 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Args.push_back(Builder.getFalse()); SmallVector ArgTypes; + if (NeedReturnType) + ArgTypes.push_back(ConvertType(E->getType())); for (auto ArgIdx : ArgsForMatchingMatrixTypes) ArgTypes.push_back(Args[ArgIdx]->getType()); diff --git a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp index f9890285f0aab..270e9fc976f23 100644 --- a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp @@ -1151,6 +1151,11 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, Value *Acc = Builder.CreateLoad(Addr); CallOps.push_back(Acc); } + if (BuiltinID == PPC::BI__builtin_mma_dmmr || + BuiltinID == PPC::BI__builtin_mma_dmxor) { + Address Addr = EmitPointerWithAlignment(E->getArg(1)); + Ops[1] = Builder.CreateLoad(Addr); + } for (unsigned i=1; igetType(), Intrinsic::spv_reflect, ArrayRef{I, N}, nullptr, "spv.reflect"); } + case SPIRV::BI__builtin_spirv_refract: { + Value *I = EmitScalarExpr(E->getArg(0)); + Value *N = EmitScalarExpr(E->getArg(1)); + Value *eta = EmitScalarExpr(E->getArg(2)); + assert(E->getArg(0)->getType()->hasFloatingRepresentation() && + E->getArg(1)->getType()->hasFloatingRepresentation() && + E->getArg(2)->getType()->isFloatingType() && + "refract operands must have a float representation"); + return Builder.CreateIntrinsic( + /*ReturnType=*/I->getType(), Intrinsic::spv_refract, + ArrayRef{I, N, eta}, nullptr, "spv.refract"); + } case SPIRV::BI__builtin_spirv_smoothstep: { Value *Min = EmitScalarExpr(E->getArg(0)); Value *Max = EmitScalarExpr(E->getArg(1)); diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index b4057d369f988..d0edae1295094 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -545,9 +545,6 @@ createMSP430TargetCodeGenInfo(CodeGenModule &CGM); std::unique_ptr createNVPTXTargetCodeGenInfo(CodeGenModule &CGM); -std::unique_ptr -createPNaClTargetCodeGenInfo(CodeGenModule &CGM); - enum class PPC64_SVR4_ABIKind { ELFv1 = 0, ELFv2, diff --git a/clang/lib/CodeGen/Targets/PNaCl.cpp b/clang/lib/CodeGen/Targets/PNaCl.cpp deleted file mode 100644 index 358010785850e..0000000000000 --- a/clang/lib/CodeGen/Targets/PNaCl.cpp +++ /dev/null @@ -1,114 +0,0 @@ -//===- PNaCl.cpp ----------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ABIInfoImpl.h" -#include "TargetInfo.h" - -using namespace clang; -using namespace clang::CodeGen; - -//===----------------------------------------------------------------------===// -// le32/PNaCl bitcode ABI Implementation -// -// This is a simplified version of the x86_32 ABI. Arguments and return values -// are always passed on the stack. 
-//===----------------------------------------------------------------------===// - -class PNaClABIInfo : public ABIInfo { - public: - PNaClABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {} - - ABIArgInfo classifyReturnType(QualType RetTy) const; - ABIArgInfo classifyArgumentType(QualType RetTy) const; - - void computeInfo(CGFunctionInfo &FI) const override; - RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, - AggValueSlot Slot) const override; -}; - -class PNaClTargetCodeGenInfo : public TargetCodeGenInfo { - public: - PNaClTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT) - : TargetCodeGenInfo(std::make_unique(CGT)) {} -}; - -void PNaClABIInfo::computeInfo(CGFunctionInfo &FI) const { - if (!getCXXABI().classifyReturnType(FI)) - FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); - - for (auto &I : FI.arguments()) - I.info = classifyArgumentType(I.type); -} - -RValue PNaClABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, - QualType Ty, AggValueSlot Slot) const { - // The PNaCL ABI is a bit odd, in that varargs don't use normal - // function classification. Structs get passed directly for varargs - // functions, through a rewriting transform in - // pnacl-llvm/lib/Transforms/NaCl/ExpandVarArgs.cpp, which allows - // this target to actually support a va_arg instructions with an - // aggregate type, unlike other targets. - return CGF.EmitLoadOfAnyValue( - CGF.MakeAddrLValue( - EmitVAArgInstr(CGF, VAListAddr, Ty, ABIArgInfo::getDirect()), Ty), - Slot); -} - -/// Classify argument of given type \p Ty. -ABIArgInfo PNaClABIInfo::classifyArgumentType(QualType Ty) const { - if (isAggregateTypeForABI(Ty)) { - if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) - return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(), - RAA == CGCXXABI::RAA_DirectInMemory); - return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace()); - } else if (const EnumType *EnumTy = Ty->getAs()) { - // Treat an enum type as its underlying type. - Ty = EnumTy->getDecl()->getIntegerType(); - } else if (Ty->isFloatingType()) { - // Floating-point types don't go inreg. - return ABIArgInfo::getDirect(); - } else if (const auto *EIT = Ty->getAs()) { - // Treat bit-precise integers as integers if <= 64, otherwise pass - // indirectly. - if (EIT->getNumBits() > 64) - return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace()); - return ABIArgInfo::getDirect(); - } - - return (isPromotableIntegerTypeForABI(Ty) ? ABIArgInfo::getExtend(Ty) - : ABIArgInfo::getDirect()); -} - -ABIArgInfo PNaClABIInfo::classifyReturnType(QualType RetTy) const { - if (RetTy->isVoidType()) - return ABIArgInfo::getIgnore(); - - // In the PNaCl ABI we always return records/structures on the stack. - if (isAggregateTypeForABI(RetTy)) - return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace()); - - // Treat bit-precise integers as integers if <= 64, otherwise pass indirectly. - if (const auto *EIT = RetTy->getAs()) { - if (EIT->getNumBits() > 64) - return getNaturalAlignIndirect(RetTy, - getDataLayout().getAllocaAddrSpace()); - return ABIArgInfo::getDirect(); - } - - // Treat an enum type as its underlying type. - if (const EnumType *EnumTy = RetTy->getAs()) - RetTy = EnumTy->getDecl()->getIntegerType(); - - return (isPromotableIntegerTypeForABI(RetTy) ? 
ABIArgInfo::getExtend(RetTy) - : ABIArgInfo::getDirect()); -} - -std::unique_ptr -CodeGen::createPNaClTargetCodeGenInfo(CodeGenModule &CGM) { - return std::make_unique(CGM.getTypes()); -} diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 0f59caac2323e..abb91486e7ee6 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -2470,13 +2470,12 @@ GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset, return llvm::Type::getDoubleTy(getVMContext()); } - /// GetINTEGERTypeAtOffset - The ABI specifies that a value should be passed in -/// an 8-byte GPR. This means that we either have a scalar or we are talking -/// about the high or low part of an up-to-16-byte struct. This routine picks -/// the best LLVM IR type to represent this, which may be i64 or may be anything -/// else that the backend will pass in a GPR that works better (e.g. i8, %foo*, -/// etc). +/// one or more 8-byte GPRs. This means that we either have a scalar or we are +/// talking about the high and/or low part of an up-to-16-byte struct. This +/// routine picks the best LLVM IR type to represent this, which may be i64 or +/// may be anything else that the backend will pass in GPRs that works better +/// (e.g. i8, %foo*, etc). /// /// PrefType is an LLVM IR type that corresponds to (part of) the IR type for /// the source type. IROffset is an offset in bytes into the LLVM IR type that @@ -2534,6 +2533,13 @@ GetINTEGERTypeAtOffset(llvm::Type *IRType, unsigned IROffset, SourceOffset); } + // if we have a 128-bit integer, we can pass it safely using an i128 + // so we return that + if (IRType->isIntegerTy(128)) { + assert(IROffset == 0); + return IRType; + } + // Okay, we don't have any better idea of what to pass, so we pass this in an // integer register that isn't too big to fit the rest of the struct. unsigned TySizeInBytes = @@ -2572,8 +2578,7 @@ GetX86_64ByValArgumentPair(llvm::Type *Lo, llvm::Type *Hi, if (HiStart != 8) { // There are usually two sorts of types the ABI generation code can produce // for the low part of a pair that aren't 8 bytes in size: half, float or - // i8/i16/i32. This can also include pointers when they are 32-bit (X32 and - // NaCl). + // i8/i16/i32. This can also include pointers when they are 32-bit (X32). // Promote these to a larger type. if (Lo->isHalfTy() || Lo->isFloatTy()) Lo = llvm::Type::getDoubleTy(Lo->getContext()); @@ -2592,8 +2597,7 @@ GetX86_64ByValArgumentPair(llvm::Type *Lo, llvm::Type *Hi, return Result; } -ABIArgInfo X86_64ABIInfo:: -classifyReturnType(QualType RetTy) const { +ABIArgInfo X86_64ABIInfo::classifyReturnType(QualType RetTy) const { // AMD64-ABI 3.2.3p4: Rule 1. Classify the return type with the // classification algorithm. X86_64ABIInfo::Class Lo, Hi; @@ -2639,6 +2643,12 @@ classifyReturnType(QualType RetTy) const { isPromotableIntegerTypeForABI(RetTy)) return ABIArgInfo::getExtend(RetTy); } + + if (ResType->isIntegerTy(128)) { + // i128 are passed directly + assert(Hi == Integer); + return ABIArgInfo::getDirect(ResType); + } break; // AMD64-ABI 3.2.3p4: Rule 4. If the class is SSE, the next @@ -2784,6 +2794,11 @@ X86_64ABIInfo::classifyArgumentType(QualType Ty, unsigned freeIntRegs, return ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty)); } + if (ResType->isIntegerTy(128)) { + assert(Hi == Integer); + ++neededInt; + return ABIArgInfo::getDirect(ResType); + } break; // AMD64-ABI 3.2.3p3: Rule 3. 
If the class is SSE, the next diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 0268ec49eaf11..45782cbd9d16d 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt @@ -71,7 +71,6 @@ add_clang_library(clangDriver ToolChains/MinGW.cpp ToolChains/MSP430.cpp ToolChains/MSVC.cpp - ToolChains/NaCl.cpp ToolChains/NetBSD.cpp ToolChains/OHOS.cpp ToolChains/OpenBSD.cpp diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 55748c0bce918..ec1135eecd401 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -36,7 +36,6 @@ #include "ToolChains/Managarm.h" #include "ToolChains/MinGW.h" #include "ToolChains/MipsLinux.h" -#include "ToolChains/NaCl.h" #include "ToolChains/NetBSD.h" #include "ToolChains/OHOS.h" #include "ToolChains/OpenBSD.h" @@ -6841,9 +6840,6 @@ const ToolChain &Driver::getToolChain(const ArgList &Args, else TC = std::make_unique(*this, Target, Args); break; - case llvm::Triple::NaCl: - TC = std::make_unique(*this, Target, Args); - break; case llvm::Triple::Fuchsia: TC = std::make_unique(*this, Target, Args); break; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index fe1865888bdd0..8880c9375143f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1746,6 +1746,13 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args, Args.addOptInFlag(CmdArgs, options::OPT_faarch64_jump_table_hardening, options::OPT_fno_aarch64_jump_table_hardening); + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_objc_isa, + options::OPT_fno_ptrauth_objc_isa); + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_objc_interface_sel, + options::OPT_fno_ptrauth_objc_interface_sel); + Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_objc_class_ro, + options::OPT_fno_ptrauth_objc_class_ro); + if (Triple.getEnvironment() == llvm::Triple::PAuthTest) handlePAuthABI(Args, CmdArgs); @@ -3839,15 +3846,6 @@ static void RenderOpenACCOptions(const Driver &D, const ArgList &Args, return; CmdArgs.push_back("-fopenacc"); - - if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) { - StringRef Value = A->getValue(); - int Version; - if (!Value.getAsInteger(10, Version)) - A->renderAsInput(Args, CmdArgs); - else - D.Diag(diag::err_drv_clang_unsupported) << Value; - } } static void RenderBuiltinOptions(const ToolChain &TC, const llvm::Triple &T, diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 097d186ad8ea4..651a39c03bb54 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1320,6 +1320,17 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, if (Args.hasArg(options::OPT_ftime_report)) CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-time-passes")); + + if (Arg *A = Args.getLastArg(options::OPT_fthinlto_distributor_EQ)) { + CmdArgs.push_back( + Args.MakeArgString("--thinlto-distributor=" + Twine(A->getValue()))); + CmdArgs.push_back( + Args.MakeArgString("--thinlto-remote-compiler=" + + Twine(ToolChain.getDriver().getClangProgramPath()))); + + for (auto A : Args.getAllArgValues(options::OPT_Xthinlto_distributor_EQ)) + CmdArgs.push_back(Args.MakeArgString("--thinlto-distributor-arg=" + A)); + } } void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC, diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index f5e2655857432..298d227403f9a 
100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -591,12 +591,41 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + // Facebook T92898286 + if (Args.hasArg(options::OPT_post_link_optimize)) + CmdArgs.push_back("-q"); + // End Facebook T92898286 + Args.addAllArgs(CmdArgs, {options::OPT_T, options::OPT_t}); const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs, Output)); + // Facebook T92898286 + if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename()) + return; + + const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv")); + ArgStringList MoveCmdArgs; + MoveCmdArgs.push_back(Output.getFilename()); + const char *PreBoltBin = + Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt"); + MoveCmdArgs.push_back(PreBoltBin); + C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), + MvExec, MoveCmdArgs, std::nullopt)); + + ArgStringList BoltCmdArgs; + const char *BoltExec = + Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt")); + BoltCmdArgs.push_back(PreBoltBin); + BoltCmdArgs.push_back("-reorder-blocks=reverse"); + BoltCmdArgs.push_back("-update-debug-sections"); + BoltCmdArgs.push_back("-o"); + BoltCmdArgs.push_back(Output.getFilename()); + C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), + BoltExec, BoltCmdArgs, std::nullopt)); + // End Facebook T92898286 } void tools::gnutools::Assembler::ConstructJob(Compilation &C, diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index 7d093d20b3dd9..b2e36ae6f97c3 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -132,7 +132,9 @@ void tools::MinGW::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("thumb2pe"); break; case llvm::Triple::aarch64: - if (TC.getEffectiveTriple().isWindowsArm64EC()) + if (Args.hasArg(options::OPT_marm64x)) + CmdArgs.push_back("arm64xpe"); + else if (TC.getEffectiveTriple().isWindowsArm64EC()) CmdArgs.push_back("arm64ecpe"); else CmdArgs.push_back("arm64pe"); diff --git a/clang/lib/Driver/ToolChains/NaCl.cpp b/clang/lib/Driver/ToolChains/NaCl.cpp deleted file mode 100644 index f7acaa1f3a78d..0000000000000 --- a/clang/lib/Driver/ToolChains/NaCl.cpp +++ /dev/null @@ -1,370 +0,0 @@ -//===--- NaCl.cpp - Native Client ToolChain Implementations -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "NaCl.h" -#include "clang/Driver/CommonArgs.h" -#include "clang/Driver/Compilation.h" -#include "clang/Driver/Driver.h" -#include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" -#include "llvm/Option/ArgList.h" -#include "llvm/Support/Path.h" - -using namespace clang::driver; -using namespace clang::driver::tools; -using namespace clang::driver::toolchains; -using namespace clang; -using namespace llvm::opt; - -// NaCl ARM assembly (inline or standalone) can be written with a set of macros -// for the various SFI requirements like register masking. The assembly tool -// inserts the file containing the macros as an input into all the assembly -// jobs. 
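// Effect of the --post-link-optimize path added to the GNU linker job above,
// expressed as the equivalent command sequence (illustrative):
//   ld ... -q -o foo             # keep relocations so BOLT can rewrite foo
//   mv foo foo.pre-bolt
//   llvm-bolt foo.pre-bolt -reorder-blocks=reverse \
//       -update-debug-sections -o foo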
-void nacltools::AssemblerARM::ConstructJob(Compilation &C, const JobAction &JA, - const InputInfo &Output, - const InputInfoList &Inputs, - const ArgList &Args, - const char *LinkingOutput) const { - const auto &ToolChain = static_cast(getToolChain()); - InputInfo NaClMacros(types::TY_PP_Asm, ToolChain.GetNaClArmMacrosPath(), - "nacl-arm-macros.s"); - InputInfoList NewInputs; - NewInputs.push_back(NaClMacros); - NewInputs.append(Inputs.begin(), Inputs.end()); - gnutools::Assembler::ConstructJob(C, JA, Output, NewInputs, Args, - LinkingOutput); -} - -// This is quite similar to gnutools::Linker::ConstructJob with changes that -// we use static by default, do not yet support sanitizers or LTO, and a few -// others. Eventually we can support more of that and hopefully migrate back -// to gnutools::Linker. -void nacltools::Linker::ConstructJob(Compilation &C, const JobAction &JA, - const InputInfo &Output, - const InputInfoList &Inputs, - const ArgList &Args, - const char *LinkingOutput) const { - - const auto &ToolChain = static_cast(getToolChain()); - const Driver &D = ToolChain.getDriver(); - const llvm::Triple::ArchType Arch = ToolChain.getArch(); - const bool IsStatic = - !Args.hasArg(options::OPT_dynamic) && !Args.hasArg(options::OPT_shared); - - ArgStringList CmdArgs; - - // Silence warning for "clang -g foo.o -o foo" - Args.ClaimAllArgs(options::OPT_g_Group); - // and "clang -emit-llvm foo.o -o foo" - Args.ClaimAllArgs(options::OPT_emit_llvm); - // and for "clang -w foo.o -o foo". Other warning options are already - // handled somewhere else. - Args.ClaimAllArgs(options::OPT_w); - - if (!D.SysRoot.empty()) - CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); - - if (Args.hasArg(options::OPT_rdynamic)) - CmdArgs.push_back("-export-dynamic"); - - if (Args.hasArg(options::OPT_s)) - CmdArgs.push_back("-s"); - - // NaClToolChain doesn't have ExtraOpts like Linux; the only relevant flag - // from there is --build-id, which we do want. 
- CmdArgs.push_back("--build-id"); - - if (!IsStatic) - CmdArgs.push_back("--eh-frame-hdr"); - - CmdArgs.push_back("-m"); - if (Arch == llvm::Triple::x86) - CmdArgs.push_back("elf_i386_nacl"); - else if (Arch == llvm::Triple::arm) - CmdArgs.push_back("armelf_nacl"); - else if (Arch == llvm::Triple::x86_64) - CmdArgs.push_back("elf_x86_64_nacl"); - else if (Arch == llvm::Triple::mipsel) - CmdArgs.push_back("mipselelf_nacl"); - else - D.Diag(diag::err_target_unsupported_arch) << ToolChain.getArchName() - << "Native Client"; - - if (IsStatic) - CmdArgs.push_back("-static"); - else if (Args.hasArg(options::OPT_shared)) - CmdArgs.push_back("-shared"); - - CmdArgs.push_back("-o"); - CmdArgs.push_back(Output.getFilename()); - if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) { - if (!Args.hasArg(options::OPT_shared)) - CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crt1.o"))); - CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crti.o"))); - - const char *crtbegin; - if (IsStatic) - crtbegin = "crtbeginT.o"; - else if (Args.hasArg(options::OPT_shared)) - crtbegin = "crtbeginS.o"; - else - crtbegin = "crtbegin.o"; - CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtbegin))); - } - - Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_u}); - - ToolChain.AddFilePathLibArgs(Args, CmdArgs); - - if (Args.hasArg(options::OPT_Z_Xlinker__no_demangle)) - CmdArgs.push_back("--no-demangle"); - - AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); - - if (D.CCCIsCXX() && - !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - if (ToolChain.ShouldLinkCXXStdlib(Args)) { - bool OnlyLibstdcxxStatic = - Args.hasArg(options::OPT_static_libstdcxx) && !IsStatic; - if (OnlyLibstdcxxStatic) - CmdArgs.push_back("-Bstatic"); - ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs); - if (OnlyLibstdcxxStatic) - CmdArgs.push_back("-Bdynamic"); - } - CmdArgs.push_back("-lm"); - } - - if (!Args.hasArg(options::OPT_nostdlib)) { - if (!Args.hasArg(options::OPT_nodefaultlibs)) { - // Always use groups, since it has no effect on dynamic libraries. - CmdArgs.push_back("--start-group"); - CmdArgs.push_back("-lc"); - // NaCl's libc++ currently requires libpthread, so just always include it - // in the group for C++. - if (Args.hasArg(options::OPT_pthread) || - Args.hasArg(options::OPT_pthreads) || D.CCCIsCXX()) { - // Gold, used by Mips, handles nested groups differently than ld, and - // without '-lnacl' it prefers symbols from libpthread.a over libnacl.a, - // which is not a desired behaviour here. - // See https://sourceware.org/ml/binutils/2015-03/msg00034.html - if (getToolChain().getArch() == llvm::Triple::mipsel) - CmdArgs.push_back("-lnacl"); - - CmdArgs.push_back("-lpthread"); - } - - CmdArgs.push_back("-lgcc"); - CmdArgs.push_back("--as-needed"); - if (IsStatic) - CmdArgs.push_back("-lgcc_eh"); - else - CmdArgs.push_back("-lgcc_s"); - CmdArgs.push_back("--no-as-needed"); - - // Mips needs to create and use pnacl_legacy library that contains - // definitions from bitcode/pnaclmm.c and definitions for - // __nacl_tp_tls_offset() and __nacl_tp_tdb_offset(). 
- if (getToolChain().getArch() == llvm::Triple::mipsel) - CmdArgs.push_back("-lpnacl_legacy"); - - CmdArgs.push_back("--end-group"); - } - - if (!Args.hasArg(options::OPT_nostartfiles)) { - const char *crtend; - if (Args.hasArg(options::OPT_shared)) - crtend = "crtendS.o"; - else - crtend = "crtend.o"; - - CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtend))); - CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtn.o"))); - } - } - - const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); - C.addCommand(std::make_unique(JA, *this, - ResponseFileSupport::AtFileCurCP(), - Exec, CmdArgs, Inputs, Output)); -} - -/// NaCl Toolchain -NaClToolChain::NaClToolChain(const Driver &D, const llvm::Triple &Triple, - const ArgList &Args) - : Generic_ELF(D, Triple, Args) { - - // Remove paths added by Generic_GCC. NaCl Toolchain cannot use the - // default paths, and must instead only use the paths provided - // with this toolchain based on architecture. - path_list &file_paths = getFilePaths(); - path_list &prog_paths = getProgramPaths(); - - file_paths.clear(); - prog_paths.clear(); - - // Path for library files (libc.a, ...) - std::string FilePath(getDriver().Dir + "/../"); - - // Path for tools (clang, ld, etc..) - std::string ProgPath(getDriver().Dir + "/../"); - - // Path for toolchain libraries (libgcc.a, ...) - std::string ToolPath(getDriver().ResourceDir + "/lib/"); - - switch (Triple.getArch()) { - case llvm::Triple::x86: - file_paths.push_back(FilePath + "x86_64-nacl/lib32"); - file_paths.push_back(FilePath + "i686-nacl/usr/lib"); - prog_paths.push_back(ProgPath + "x86_64-nacl/bin"); - file_paths.push_back(ToolPath + "i686-nacl"); - break; - case llvm::Triple::x86_64: - file_paths.push_back(FilePath + "x86_64-nacl/lib"); - file_paths.push_back(FilePath + "x86_64-nacl/usr/lib"); - prog_paths.push_back(ProgPath + "x86_64-nacl/bin"); - file_paths.push_back(ToolPath + "x86_64-nacl"); - break; - case llvm::Triple::arm: - file_paths.push_back(FilePath + "arm-nacl/lib"); - file_paths.push_back(FilePath + "arm-nacl/usr/lib"); - prog_paths.push_back(ProgPath + "arm-nacl/bin"); - file_paths.push_back(ToolPath + "arm-nacl"); - break; - case llvm::Triple::mipsel: - file_paths.push_back(FilePath + "mipsel-nacl/lib"); - file_paths.push_back(FilePath + "mipsel-nacl/usr/lib"); - prog_paths.push_back(ProgPath + "bin"); - file_paths.push_back(ToolPath + "mipsel-nacl"); - break; - default: - break; - } - - NaClArmMacrosPath = GetFilePath("nacl-arm-macros.s"); -} - -void NaClToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { - const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) - return; - - if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { - SmallString<128> P(D.ResourceDir); - llvm::sys::path::append(P, "include"); - addSystemInclude(DriverArgs, CC1Args, P.str()); - } - - if (DriverArgs.hasArg(options::OPT_nostdlibinc)) - return; - - SmallString<128> P(D.Dir + "/../"); - switch (getTriple().getArch()) { - case llvm::Triple::x86: - // x86 is special because multilib style uses x86_64-nacl/include for libc - // headers but the SDK wants i686-nacl/usr/include. The other architectures - // have the same substring. 
- llvm::sys::path::append(P, "i686-nacl/usr/include"); - addSystemInclude(DriverArgs, CC1Args, P.str()); - llvm::sys::path::remove_filename(P); - llvm::sys::path::remove_filename(P); - llvm::sys::path::remove_filename(P); - llvm::sys::path::append(P, "x86_64-nacl/include"); - addSystemInclude(DriverArgs, CC1Args, P.str()); - return; - case llvm::Triple::arm: - llvm::sys::path::append(P, "arm-nacl/usr/include"); - break; - case llvm::Triple::x86_64: - llvm::sys::path::append(P, "x86_64-nacl/usr/include"); - break; - case llvm::Triple::mipsel: - llvm::sys::path::append(P, "mipsel-nacl/usr/include"); - break; - default: - return; - } - - addSystemInclude(DriverArgs, CC1Args, P.str()); - llvm::sys::path::remove_filename(P); - llvm::sys::path::remove_filename(P); - llvm::sys::path::append(P, "include"); - addSystemInclude(DriverArgs, CC1Args, P.str()); -} - -void NaClToolChain::AddCXXStdlibLibArgs(const ArgList &Args, - ArgStringList &CmdArgs) const { - // Check for -stdlib= flags. We only support libc++ but this consumes the arg - // if the value is libc++, and emits an error for other values. - GetCXXStdlibType(Args); - CmdArgs.push_back("-lc++"); - if (Args.hasArg(options::OPT_fexperimental_library)) - CmdArgs.push_back("-lc++experimental"); -} - -void NaClToolChain::addLibCxxIncludePaths( - const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const { - const Driver &D = getDriver(); - - SmallString<128> P(D.Dir + "/../"); - switch (getTriple().getArch()) { - default: - break; - case llvm::Triple::arm: - llvm::sys::path::append(P, "arm-nacl/include/c++/v1"); - addSystemInclude(DriverArgs, CC1Args, P.str()); - break; - case llvm::Triple::x86: - llvm::sys::path::append(P, "x86_64-nacl/include/c++/v1"); - addSystemInclude(DriverArgs, CC1Args, P.str()); - break; - case llvm::Triple::x86_64: - llvm::sys::path::append(P, "x86_64-nacl/include/c++/v1"); - addSystemInclude(DriverArgs, CC1Args, P.str()); - break; - case llvm::Triple::mipsel: - llvm::sys::path::append(P, "mipsel-nacl/include/c++/v1"); - addSystemInclude(DriverArgs, CC1Args, P.str()); - break; - } -} - -ToolChain::CXXStdlibType -NaClToolChain::GetCXXStdlibType(const ArgList &Args) const { - if (Arg *A = Args.getLastArg(options::OPT_stdlib_EQ)) { - StringRef Value = A->getValue(); - if (Value == "libc++") - return ToolChain::CST_Libcxx; - getDriver().Diag(clang::diag::err_drv_invalid_stdlib_name) - << A->getAsString(Args); - } - - return ToolChain::CST_Libcxx; -} - -std::string -NaClToolChain::ComputeEffectiveClangTriple(const ArgList &Args, - types::ID InputType) const { - llvm::Triple TheTriple(ComputeLLVMTriple(Args, InputType)); - if (TheTriple.getArch() == llvm::Triple::arm && - TheTriple.getEnvironment() == llvm::Triple::UnknownEnvironment) - TheTriple.setEnvironment(llvm::Triple::GNUEABIHF); - return TheTriple.getTriple(); -} - -Tool *NaClToolChain::buildLinker() const { - return new tools::nacltools::Linker(*this); -} - -Tool *NaClToolChain::buildAssembler() const { - if (getTriple().getArch() == llvm::Triple::arm) - return new tools::nacltools::AssemblerARM(*this); - return new tools::gnutools::Assembler(*this); -} diff --git a/clang/lib/Driver/ToolChains/NaCl.h b/clang/lib/Driver/ToolChains/NaCl.h deleted file mode 100644 index 01d4719e7b929..0000000000000 --- a/clang/lib/Driver/ToolChains/NaCl.h +++ /dev/null @@ -1,88 +0,0 @@ -//===--- NaCl.h - Native Client ToolChain Implementations -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_NACL_H -#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_NACL_H - -#include "Gnu.h" -#include "clang/Driver/Tool.h" -#include "clang/Driver/ToolChain.h" - -namespace clang { -namespace driver { -namespace tools { -namespace nacltools { -class LLVM_LIBRARY_VISIBILITY AssemblerARM : public gnutools::Assembler { -public: - AssemblerARM(const ToolChain &TC) : gnutools::Assembler(TC) {} - - void ConstructJob(Compilation &C, const JobAction &JA, - const InputInfo &Output, const InputInfoList &Inputs, - const llvm::opt::ArgList &TCArgs, - const char *LinkingOutput) const override; -}; - -class LLVM_LIBRARY_VISIBILITY Linker final : public Tool { -public: - Linker(const ToolChain &TC) : Tool("NaCl::Linker", "linker", TC) {} - - bool hasIntegratedCPP() const override { return false; } - bool isLinkJob() const override { return true; } - - void ConstructJob(Compilation &C, const JobAction &JA, - const InputInfo &Output, const InputInfoList &Inputs, - const llvm::opt::ArgList &TCArgs, - const char *LinkingOutput) const override; -}; -} // end namespace nacltools -} // end namespace tools - -namespace toolchains { - -class LLVM_LIBRARY_VISIBILITY NaClToolChain : public Generic_ELF { -public: - NaClToolChain(const Driver &D, const llvm::Triple &Triple, - const llvm::opt::ArgList &Args); - - void - AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; - void addLibCxxIncludePaths( - const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; - - CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override; - - void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) const override; - - bool IsIntegratedAssemblerDefault() const override { - return getTriple().getArch() == llvm::Triple::mipsel; - } - - // Get the path to the file containing NaCl's ARM macros. 
- // It lives in NaClToolChain because the ARMAssembler tool needs a - // const char * that it can pass around, - const char *GetNaClArmMacrosPath() const { return NaClArmMacrosPath.c_str(); } - - std::string ComputeEffectiveClangTriple(const llvm::opt::ArgList &Args, - types::ID InputType) const override; - -protected: - Tool *buildLinker() const override; - Tool *buildAssembler() const override; - -private: - std::string NaClArmMacrosPath; -}; - -} // end namespace toolchains -} // end namespace driver -} // end namespace clang - -#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_NACL_H diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index 791afc1a97575..51a6f6b779e77 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -220,8 +220,9 @@ DeclarationFragmentsBuilder::getFragmentsForNNS(const NestedNameSpecifier *NNS, break; case NestedNameSpecifier::Namespace: { - const NamespaceDecl *NS = NNS->getAsNamespace(); - if (NS->isAnonymousNamespace()) + const NamespaceBaseDecl *NS = NNS->getAsNamespace(); + if (const auto *Namespace = dyn_cast(NS); + Namespace && Namespace->isAnonymousNamespace()) return Fragments; SmallString<128> USR; index::generateUSRForDecl(NS, USR); @@ -230,16 +231,6 @@ DeclarationFragmentsBuilder::getFragmentsForNNS(const NestedNameSpecifier *NNS, break; } - case NestedNameSpecifier::NamespaceAlias: { - const NamespaceAliasDecl *Alias = NNS->getAsNamespaceAlias(); - SmallString<128> USR; - index::generateUSRForDecl(Alias, USR); - Fragments.append(Alias->getName(), - DeclarationFragments::FragmentKind::Identifier, USR, - Alias); - break; - } - case NestedNameSpecifier::Global: // The global specifier `::` at the beginning. No stored value. break; diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 78c09be458f0a..62feb3db0ed5e 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -665,21 +665,25 @@ template <> struct MappingTraits { IO.enumCase(Value, "Never", FormatStyle::SortIncludesOptions({})); IO.enumCase(Value, "CaseInsensitive", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/true})); + /*IgnoreCase=*/true, + /*IgnoreExtension=*/false})); IO.enumCase(Value, "CaseSensitive", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/false})); + /*IgnoreCase=*/false, + /*IgnoreExtension=*/false})); // For backward compatibility. 
IO.enumCase(Value, "false", FormatStyle::SortIncludesOptions({})); IO.enumCase(Value, "true", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/false})); + /*IgnoreCase=*/false, + /*IgnoreExtension=*/false})); } static void mapping(IO &IO, FormatStyle::SortIncludesOptions &Value) { IO.mapOptional("Enabled", Value.Enabled); IO.mapOptional("IgnoreCase", Value.IgnoreCase); + IO.mapOptional("IgnoreExtension", Value.IgnoreExtension); } }; @@ -1650,7 +1654,8 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.SeparateDefinitionBlocks = FormatStyle::SDS_Leave; LLVMStyle.ShortNamespaceLines = 1; LLVMStyle.SkipMacroDefinitionBody = false; - LLVMStyle.SortIncludes = {/*Enabled=*/true, /*IgnoreCase=*/false}; + LLVMStyle.SortIncludes = {/*Enabled=*/true, /*IgnoreCase=*/false, + /*IgnoreExtension=*/false}; LLVMStyle.SortJavaStaticImport = FormatStyle::SJSIO_Before; LLVMStyle.SortUsingDeclarations = FormatStyle::SUD_LexicographicNumeric; LLVMStyle.SpaceAfterCStyleCast = false; @@ -3239,19 +3244,27 @@ static void sortCppIncludes(const FormatStyle &Style, SmallVector<unsigned, 16> Indices = llvm::to_vector<16>(llvm::seq<unsigned>(0, Includes.size())); - if (Style.SortIncludes.Enabled && Style.SortIncludes.IgnoreCase) { + if (Style.SortIncludes.Enabled) { stable_sort(Indices, [&](unsigned LHSI, unsigned RHSI) { - const auto LHSFilenameLower = Includes[LHSI].Filename.lower(); - const auto RHSFilenameLower = Includes[RHSI].Filename.lower(); - return std::tie(Includes[LHSI].Priority, LHSFilenameLower, - Includes[LHSI].Filename) < - std::tie(Includes[RHSI].Priority, RHSFilenameLower, - Includes[RHSI].Filename); - }); - } else { - stable_sort(Indices, [&](unsigned LHSI, unsigned RHSI) { - return std::tie(Includes[LHSI].Priority, Includes[LHSI].Filename) < - std::tie(Includes[RHSI].Priority, Includes[RHSI].Filename); + SmallString<128> LHSStem, RHSStem; + if (Style.SortIncludes.IgnoreExtension) { + LHSStem = Includes[LHSI].Filename; + RHSStem = Includes[RHSI].Filename; + llvm::sys::path::replace_extension(LHSStem, ""); + llvm::sys::path::replace_extension(RHSStem, ""); + } + std::string LHSStemLower, RHSStemLower; + std::string LHSFilenameLower, RHSFilenameLower; + if (Style.SortIncludes.IgnoreCase) { + LHSStemLower = LHSStem.str().lower(); + RHSStemLower = RHSStem.str().lower(); + LHSFilenameLower = Includes[LHSI].Filename.lower(); + RHSFilenameLower = Includes[RHSI].Filename.lower(); + } + return std::tie(Includes[LHSI].Priority, LHSStemLower, LHSStem, + LHSFilenameLower, Includes[LHSI].Filename) < + std::tie(Includes[RHSI].Priority, RHSStemLower, RHSStem, + RHSFilenameLower, Includes[RHSI].Filename); }); } diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 739209a5681f8..581bfbab0972d 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2996,14 +2996,18 @@ class AnnotatingParser { const FormatToken *PrevToken = Tok.getPreviousNonComment(); if (!PrevToken) return TT_UnaryOperator; - if (PrevToken->is(TT_TypeName)) + if (PrevToken->isTypeName(LangOpts)) return TT_PointerOrReference; if (PrevToken->isPlacementOperator() && Tok.is(tok::ampamp)) return TT_BinaryOperator; - const FormatToken *NextToken = Tok.getNextNonComment(); + auto *NextToken = Tok.getNextNonComment(); if (!NextToken) return TT_PointerOrReference; + if (NextToken->is(tok::greater)) { + NextToken->setFinalizedType(TT_TemplateCloser); + return TT_PointerOrReference; + } if (InTemplateArgument && NextToken->is(tok::kw_noexcept)) return
TT_BinaryOperator; @@ -3112,7 +3116,7 @@ class AnnotatingParser { // It's more likely that & represents operator& than an uninitialized // reference. - if (Tok.is(tok::amp) && PrevToken && PrevToken->Tok.isAnyIdentifier() && + if (Tok.is(tok::amp) && PrevToken->Tok.isAnyIdentifier() && IsChainedOperatorAmpOrMember(PrevToken->getPreviousNonComment()) && NextToken && NextToken->Tok.isAnyIdentifier()) { if (auto NextNext = NextToken->getNextNonComment(); diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index b7f52fd224ab5..67ed17bdd7b5d 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -215,8 +215,8 @@ struct ASTUnit::ASTWriterData { llvm::BitstreamWriter Stream; ASTWriter Writer; - ASTWriterData(ModuleCache &ModCache) - : Stream(Buffer), Writer(Stream, Buffer, ModCache, {}) {} + ASTWriterData(ModuleCache &ModCache, const CodeGenOptions &CGOpts) + : Stream(Buffer), Writer(Stream, Buffer, ModCache, CGOpts, {}) {} }; void ASTUnit::clearFileLevelDecls() { @@ -235,7 +235,8 @@ const unsigned DefaultPreambleRebuildInterval = 5; static std::atomic ActiveASTUnitObjects; ASTUnit::ASTUnit(bool _MainFileIsAST) - : MainFileIsAST(_MainFileIsAST), WantTiming(getenv("LIBCLANG_TIMING")), + : CodeGenOpts(std::make_unique()), + MainFileIsAST(_MainFileIsAST), WantTiming(getenv("LIBCLANG_TIMING")), ShouldCacheCodeCompletionResults(false), IncludeBriefCommentsInCodeCompletion(false), UserFilesAreVolatile(false), UnsafeToFree(false) { @@ -516,6 +517,7 @@ class ASTInfoCollector : public ASTReaderListener { HeaderSearchOptions &HSOpts; PreprocessorOptions &PPOpts; LangOptions &LangOpt; + CodeGenOptions &CodeGenOpts; std::shared_ptr &TargetOpts; IntrusiveRefCntPtr &Target; unsigned &Counter; @@ -525,12 +527,12 @@ class ASTInfoCollector : public ASTReaderListener { public: ASTInfoCollector(Preprocessor &PP, ASTContext *Context, HeaderSearchOptions &HSOpts, PreprocessorOptions &PPOpts, - LangOptions &LangOpt, + LangOptions &LangOpt, CodeGenOptions &CodeGenOpts, std::shared_ptr &TargetOpts, IntrusiveRefCntPtr &Target, unsigned &Counter) : PP(PP), Context(Context), HSOpts(HSOpts), PPOpts(PPOpts), - LangOpt(LangOpt), TargetOpts(TargetOpts), Target(Target), - Counter(Counter) {} + LangOpt(LangOpt), CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), + Target(Target), Counter(Counter) {} bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, @@ -555,6 +557,13 @@ class ASTInfoCollector : public ASTReaderListener { return false; } + bool ReadCodeGenOptions(const CodeGenOptions &CGOpts, + StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) override { + this->CodeGenOpts = CGOpts; + return false; + } + bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, StringRef ModuleFilename, StringRef SpecificModuleCachePath, @@ -858,14 +867,16 @@ std::unique_ptr ASTUnit::LoadFromASTFile( DisableValidationForModuleKind::None; if (::getenv("LIBCLANG_DISABLE_PCH_VALIDATION")) disableValid = DisableValidationForModuleKind::All; - AST->Reader = new ASTReader( - PP, *AST->ModCache, AST->Ctx.get(), PCHContainerRdr, {}, /*isysroot=*/"", - /*DisableValidationKind=*/disableValid, AllowASTWithCompilerErrors); + AST->Reader = new ASTReader(PP, *AST->ModCache, AST->Ctx.get(), + PCHContainerRdr, *AST->CodeGenOpts, {}, + /*isysroot=*/"", + /*DisableValidationKind=*/disableValid, + AllowASTWithCompilerErrors); unsigned Counter = 0; AST->Reader->setListener(std::make_unique( *AST->PP, AST->Ctx.get(), *AST->HSOpts, 
*AST->PPOpts, *AST->LangOpts, - AST->TargetOpts, AST->Target, Counter)); + *AST->CodeGenOpts, AST->TargetOpts, AST->Target, Counter)); // Attach the AST reader to the AST context as an external AST // source, so that declarations will be deserialized from the @@ -1836,6 +1847,7 @@ std::unique_ptr ASTUnit::LoadFromCommandLine( AST->DiagOpts = DiagOpts; AST->Diagnostics = Diags; AST->FileSystemOpts = CI->getFileSystemOpts(); + AST->CodeGenOpts = std::make_unique(CI->getCodeGenOpts()); VFS = createVFSFromCompilerInvocation(*CI, *Diags, VFS); AST->FileMgr = new FileManager(AST->FileSystemOpts, VFS); AST->StorePreamblesInMemory = StorePreamblesInMemory; @@ -1851,7 +1863,7 @@ std::unique_ptr ASTUnit::LoadFromCommandLine( AST->Invocation = CI; AST->SkipFunctionBodies = SkipFunctionBodies; if (ForSerialization) - AST->WriterData.reset(new ASTWriterData(*AST->ModCache)); + AST->WriterData.reset(new ASTWriterData(*AST->ModCache, *AST->CodeGenOpts)); // Zero out now to ease cleanup during crash recovery. CI = nullptr; Diags = nullptr; @@ -2385,7 +2397,7 @@ bool ASTUnit::serialize(raw_ostream &OS) { SmallString<128> Buffer; llvm::BitstreamWriter Stream(Buffer); IntrusiveRefCntPtr ModCache = createCrossProcessModuleCache(); - ASTWriter Writer(Stream, Buffer, *ModCache, {}); + ASTWriter Writer(Stream, Buffer, *ModCache, *CodeGenOpts, {}); return serializeUnit(Writer, Buffer, getSema(), OS); } diff --git a/clang/lib/Frontend/ChainedIncludesSource.cpp b/clang/lib/Frontend/ChainedIncludesSource.cpp index f9a398dbfb90f..ba7c767f8e987 100644 --- a/clang/lib/Frontend/ChainedIncludesSource.cpp +++ b/clang/lib/Frontend/ChainedIncludesSource.cpp @@ -62,7 +62,7 @@ createASTReader(CompilerInstance &CI, StringRef pchFile, std::unique_ptr Reader; Reader.reset(new ASTReader( PP, CI.getModuleCache(), &CI.getASTContext(), CI.getPCHContainerReader(), - /*Extensions=*/{}, + CI.getCodeGenOpts(), /*Extensions=*/{}, /*isysroot=*/"", DisableValidationForModuleKind::PCH)); for (unsigned ti = 0; ti < bufNames.size(); ++ti) { StringRef sr(bufNames[ti]); @@ -138,7 +138,8 @@ IntrusiveRefCntPtr clang::createChainedIncludesSource( ArrayRef> Extensions; auto consumer = std::make_unique( Clang->getPreprocessor(), Clang->getModuleCache(), "-", /*isysroot=*/"", - Buffer, Extensions, /*AllowASTWithErrors=*/true); + Buffer, Clang->getCodeGenOpts(), Extensions, + /*AllowASTWithErrors=*/true); Clang->getASTContext().setASTMutationListener( consumer->GetASTMutationListener()); Clang->setASTConsumer(std::move(consumer)); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 6f8cc01eeed88..c7b82db292a14 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -614,7 +614,7 @@ void CompilerInstance::createPCHExternalASTSource( TheASTReader = createPCHExternalASTSource( Path, getHeaderSearchOpts().Sysroot, DisableValidation, AllowPCHWithCompilerErrors, getPreprocessor(), getModuleCache(), - getASTContext(), getPCHContainerReader(), + getASTContext(), getPCHContainerReader(), getCodeGenOpts(), getFrontendOpts().ModuleFileExtensions, DependencyCollectors, DeserializationListener, OwnDeserializationListener, Preamble, getFrontendOpts().UseGlobalModuleIndex); @@ -625,6 +625,7 @@ IntrusiveRefCntPtr CompilerInstance::createPCHExternalASTSource( DisableValidationForModuleKind DisableValidation, bool AllowPCHWithCompilerErrors, Preprocessor &PP, ModuleCache &ModCache, ASTContext &Context, const PCHContainerReader &PCHContainerRdr, + const CodeGenOptions 
&CodeGenOpts, ArrayRef> Extensions, ArrayRef> DependencyCollectors, void *DeserializationListener, bool OwnDeserializationListener, @@ -633,7 +634,7 @@ IntrusiveRefCntPtr CompilerInstance::createPCHExternalASTSource( PP.getHeaderSearchInfo().getHeaderSearchOpts(); IntrusiveRefCntPtr Reader(new ASTReader( - PP, ModCache, &Context, PCHContainerRdr, Extensions, + PP, ModCache, &Context, PCHContainerRdr, CodeGenOpts, Extensions, Sysroot.empty() ? "" : Sysroot.data(), DisableValidation, AllowPCHWithCompilerErrors, /*AllowConfigurationMismatch*/ false, HSOpts.ModulesValidateSystemHeaders, @@ -1746,7 +1747,8 @@ void CompilerInstance::createASTReader() { "Reading modules", *timerGroup); TheASTReader = new ASTReader( getPreprocessor(), getModuleCache(), &getASTContext(), - getPCHContainerReader(), getFrontendOpts().ModuleFileExtensions, + getPCHContainerReader(), getCodeGenOpts(), + getFrontendOpts().ModuleFileExtensions, Sysroot.empty() ? "" : Sysroot.c_str(), PPOpts.DisablePCHOrModuleValidation, /*AllowASTWithCompilerErrors=*/FEOpts.AllowPCMWithCompilerErrors, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index b9f75796ecc16..3a36250da57a3 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -593,11 +593,11 @@ static bool FixupInvocation(CompilerInvocation &Invocation, CodeGenOpts.CodeModel = TargetOpts.CodeModel; CodeGenOpts.LargeDataThreshold = TargetOpts.LargeDataThreshold; - if (LangOpts.getExceptionHandling() != - LangOptions::ExceptionHandlingKind::None && + if (CodeGenOpts.getExceptionHandling() != + CodeGenOptions::ExceptionHandlingKind::None && T.isWindowsMSVCEnvironment()) Diags.Report(diag::err_fe_invalid_exception_model) - << static_cast(LangOpts.getExceptionHandling()) << T.str(); + << static_cast(CodeGenOpts.getExceptionHandling()) << T.str(); if (LangOpts.AppleKext && !LangOpts.CPlusPlus) Diags.Report(diag::warn_c_kext); @@ -1541,6 +1541,25 @@ void CompilerInvocation::setDefaultPointerAuthOptions( Key::ASIA, LangOpts.PointerAuthInitFiniAddressDiscrimination, Discrimination::Constant, InitFiniPointerConstantDiscriminator); } + + Opts.ObjCMethodListFunctionPointers = + PointerAuthSchema(Key::ASIA, true, Discrimination::None); + Opts.ObjCMethodListPointer = + PointerAuthSchema(Key::ASDA, true, Discrimination::Constant, + MethodListPointerConstantDiscriminator); + if (LangOpts.PointerAuthObjcIsa) { + Opts.ObjCIsaPointers = + PointerAuthSchema(Key::ASDA, true, Discrimination::Constant, + IsaPointerConstantDiscriminator); + Opts.ObjCSuperPointers = + PointerAuthSchema(Key::ASDA, true, Discrimination::Constant, + SuperPointerConstantDiscriminator); + } + + if (LangOpts.PointerAuthObjcClassROPointers) + Opts.ObjCClassROPointers = + PointerAuthSchema(Key::ASDA, true, Discrimination::Constant, + ClassROConstantDiscriminator); } Opts.ReturnAddresses = LangOpts.PointerAuthReturns; Opts.AuthTraps = LangOpts.PointerAuthAuthTraps; @@ -3573,6 +3592,12 @@ static void GeneratePointerAuthArgs(const LangOptions &Opts, GenerateArg(Consumer, OPT_fptrauth_elf_got); if (Opts.AArch64JumpTableHardening) GenerateArg(Consumer, OPT_faarch64_jump_table_hardening); + if (Opts.PointerAuthObjcIsa) + GenerateArg(Consumer, OPT_fptrauth_objc_isa); + if (Opts.PointerAuthObjcInterfaceSel) + GenerateArg(Consumer, OPT_fptrauth_objc_interface_sel); + if (Opts.PointerAuthObjcClassROPointers) + GenerateArg(Consumer, OPT_fptrauth_objc_class_ro); } static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args, @@ 
-3596,6 +3621,15 @@ static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args, Opts.PointerAuthELFGOT = Args.hasArg(OPT_fptrauth_elf_got); Opts.AArch64JumpTableHardening = Args.hasArg(OPT_faarch64_jump_table_hardening); + + Opts.PointerAuthObjcIsa = Args.hasArg(OPT_fptrauth_objc_isa); + Opts.PointerAuthObjcClassROPointers = Args.hasArg(OPT_fptrauth_objc_class_ro); + Opts.PointerAuthObjcInterfaceSel = + Args.hasArg(OPT_fptrauth_objc_interface_sel); + + if (Opts.PointerAuthObjcInterfaceSel) + Opts.PointerAuthObjcInterfaceSelKey = + static_cast(PointerAuthSchema::ARM8_3Key::ASDB); } /// Check if input file kind and language standard are compatible. @@ -3679,23 +3713,6 @@ static StringRef GetInputKindName(InputKind IK) { llvm_unreachable("unknown input language"); } -static StringRef getExceptionHandlingName(unsigned EHK) { - switch (static_cast(EHK)) { - case LangOptions::ExceptionHandlingKind::None: - return "none"; - case LangOptions::ExceptionHandlingKind::DwarfCFI: - return "dwarf"; - case LangOptions::ExceptionHandlingKind::SjLj: - return "sjlj"; - case LangOptions::ExceptionHandlingKind::WinEH: - return "seh"; - case LangOptions::ExceptionHandlingKind::Wasm: - return "wasm"; - } - - llvm_unreachable("covered switch"); -} - void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, ArgumentConsumer Consumer, const llvm::Triple &T, @@ -3711,10 +3728,6 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, GenerateArg(Consumer, OPT_pic_is_pie); for (StringRef Sanitizer : serializeSanitizerKinds(Opts.Sanitize)) GenerateArg(Consumer, OPT_fsanitize_EQ, Sanitizer); - if (Opts.ExceptionHandling) { - GenerateArg(Consumer, OPT_exception_model, - getExceptionHandlingName(Opts.ExceptionHandling)); - } return; } @@ -3900,12 +3913,8 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, if (Opts.OpenMPCUDAMode) GenerateArg(Consumer, OPT_fopenmp_cuda_mode); - if (Opts.OpenACC) { + if (Opts.OpenACC) GenerateArg(Consumer, OPT_fopenacc); - if (!Opts.OpenACCMacroOverride.empty()) - GenerateArg(Consumer, OPT_openacc_macro_override, - Opts.OpenACCMacroOverride); - } // The arguments used to set Optimize, OptimizeSize and NoInlineDefine are // generated from CodeGenOptions. @@ -4023,24 +4032,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, parseSanitizerKinds("-fsanitize=", Args.getAllArgValues(OPT_fsanitize_EQ), Diags, Opts.Sanitize); - if (const Arg *A = Args.getLastArg(options::OPT_exception_model)) { - std::optional EMValue = - llvm::StringSwitch>( - A->getValue()) - .Case("dwarf", LangOptions::ExceptionHandlingKind::DwarfCFI) - .Case("sjlj", LangOptions::ExceptionHandlingKind::SjLj) - .Case("seh", LangOptions::ExceptionHandlingKind::WinEH) - .Case("wasm", LangOptions::ExceptionHandlingKind::Wasm) - .Case("none", LangOptions::ExceptionHandlingKind::None) - .Default(std::nullopt); - if (EMValue) { - Opts.ExceptionHandling = static_cast(*EMValue); - } else { - Diags.Report(diag::err_drv_invalid_value) - << A->getAsString(Args) << A->getValue(); - } - } - return Diags.getNumErrors() == NumErrorsBefore; } @@ -4429,29 +4420,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Args.hasArg(options::OPT_fopenmp_cuda_mode); // OpenACC Configuration. 
- if (Args.hasArg(options::OPT_fopenacc)) { + if (Args.hasArg(options::OPT_fopenacc)) Opts.OpenACC = true; - if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) - Opts.OpenACCMacroOverride = A->getValue(); - } - - // FIXME: Eliminate this dependency. - unsigned Opt = getOptimizationLevel(Args, IK, Diags), - OptSize = getOptimizationLevelSize(Args); - Opts.Optimize = Opt != 0; - Opts.OptimizeSize = OptSize != 0; - - // This is the __NO_INLINE__ define, which just depends on things like the - // optimization level and -fno-inline, not actually whether the backend has - // inlining enabled. - Opts.NoInlineDefine = !Opts.Optimize; - if (Arg *InlineArg = Args.getLastArg( - options::OPT_finline_functions, options::OPT_finline_hint_functions, - options::OPT_fno_inline_functions, options::OPT_fno_inline)) - if (InlineArg->getOption().matches(options::OPT_fno_inline)) - Opts.NoInlineDefine = true; - if (Arg *A = Args.getLastArg(OPT_ffp_contract)) { StringRef Val = A->getValue(); if (Val == "fast") @@ -5301,6 +5272,21 @@ std::string CompilerInvocation::getModuleHash() const { HBuilder.add(*Build); } + // Extend the signature with affecting codegen options. + { + using CK = CodeGenOptions::CompatibilityKind; +#define CODEGENOPT(Name, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) \ + HBuilder.add(CodeGenOpts->Name); +#define ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) \ + HBuilder.add(static_cast(CodeGenOpts->get##Name())); +#define DEBUGOPT(Name, Bits, Default, Compatibility) +#define VALUE_DEBUGOPT(Name, Bits, Default, Compatibility) +#define ENUM_DEBUGOPT(Name, Type, Bits, Default, Compatibility) +#include "clang/Basic/CodeGenOptions.def" + } + // When compiling with -gmodules, also hash -fdebug-prefix-map as it // affects the debug info in the PCM. if (getCodeGenOpts().DebugTypeExtRefs) @@ -5311,13 +5297,13 @@ std::string CompilerInvocation::getModuleHash() const { // FIXME: Replace with C++20 `using enum CodeGenOptions::CompatibilityKind`. 
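
The getModuleHash() hunk above leans on the .def X-macro pattern: each option record carries a CompatibilityKind, and an if constexpr drops Benign options from the hash at compile time. A minimal self-contained sketch of that shape (toy names and a toy hash, not the clang sources):

#include <cstdint>
#include <functional>
#include <iostream>

// Stand-ins for clang's CompatibilityKind, hash builder, and options.
enum class CK { Benign, Compatible, Affecting };

struct Hasher {
  uint64_t H = 0;
  void add(uint64_t V) { H = H * 1315423911u ^ std::hash<uint64_t>{}(V); }
};

struct Opts {
  unsigned OptimizationLevel = 2; // pretend this is Affecting
  unsigned VerboseAsm = 0;        // pretend this is Benign
};

int main() {
  Opts O;
  Hasher HB;
  // Each record names its compatibility; Benign ones drop out at compile time.
#define TOY_OPT(Name, Compatibility)                                           \
  if constexpr (CK::Compatibility != CK::Benign)                               \
    HB.add(O.Name);
  TOY_OPT(OptimizationLevel, Affecting)
  TOY_OPT(VerboseAsm, Benign)
#undef TOY_OPT
  std::cout << HB.H << '\n'; // only OptimizationLevel contributed
}
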
using CK = CodeGenOptions::CompatibilityKind; #define DEBUGOPT(Name, Bits, Default, Compatibility) \ - if constexpr (CK::Compatibility == CK::Affecting) \ + if constexpr (CK::Compatibility != CK::Benign) \ HBuilder.add(CodeGenOpts->Name); #define VALUE_DEBUGOPT(Name, Bits, Default, Compatibility) \ - if constexpr (CK::Compatibility == CK::Affecting) \ + if constexpr (CK::Compatibility != CK::Benign) \ HBuilder.add(CodeGenOpts->Name); #define ENUM_DEBUGOPT(Name, Type, Bits, Default, Compatibility) \ - if constexpr (CK::Compatibility == CK::Affecting) \ + if constexpr (CK::Compatibility != CK::Benign) \ HBuilder.add(static_cast(CodeGenOpts->get##Name())); #include "clang/Basic/DebugOptions.def" } diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index f5996a8e1e88b..1d82fc775b28a 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -947,8 +947,9 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, if (ASTReader::isAcceptableASTFile( Dir->path(), FileMgr, CI.getModuleCache(), CI.getPCHContainerReader(), CI.getLangOpts(), - CI.getTargetOpts(), CI.getPreprocessorOpts(), - SpecificModuleCachePath, /*RequireStrictOptionMatches=*/true)) { + CI.getCodeGenOpts(), CI.getTargetOpts(), + CI.getPreprocessorOpts(), SpecificModuleCachePath, + /*RequireStrictOptionMatches=*/true)) { PPOpts.ImplicitPCHInclude = std::string(Dir->path()); Found = true; break; diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index 89eb0b689f85c..dcfbd5386f3dc 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -139,7 +139,7 @@ GeneratePCHAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { std::vector> Consumers; Consumers.push_back(std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), OutputFile, Sysroot, Buffer, - FrontendOpts.ModuleFileExtensions, + CI.getCodeGenOpts(), FrontendOpts.ModuleFileExtensions, CI.getPreprocessorOpts().AllowPCHWithCompilerErrors, FrontendOpts.IncludeTimestamps, FrontendOpts.BuildingImplicitModule, +CI.getLangOpts().CacheGeneratedPCH)); @@ -199,7 +199,7 @@ GenerateModuleAction::CreateMultiplexConsumer(CompilerInstance &CI, Consumers.push_back(std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), OutputFile, Sysroot, Buffer, - CI.getFrontendOpts().ModuleFileExtensions, + CI.getCodeGenOpts(), CI.getFrontendOpts().ModuleFileExtensions, /*AllowASTWithErrors=*/ +CI.getFrontendOpts().AllowPCMWithCompilerErrors, /*IncludeTimestamps=*/ @@ -292,13 +292,13 @@ GenerateModuleInterfaceAction::CreateASTConsumer(CompilerInstance &CI, !CI.getFrontendOpts().ModuleOutputPath.empty()) { Consumers.push_back(std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), - CI.getFrontendOpts().ModuleOutputPath, + CI.getFrontendOpts().ModuleOutputPath, CI.getCodeGenOpts(), +CI.getFrontendOpts().AllowPCMWithCompilerErrors)); } Consumers.push_back(std::make_unique( CI.getPreprocessor(), CI.getModuleCache(), - CI.getFrontendOpts().OutputFile, + CI.getFrontendOpts().OutputFile, CI.getCodeGenOpts(), +CI.getFrontendOpts().AllowPCMWithCompilerErrors)); return std::make_unique(std::move(Consumers)); @@ -313,9 +313,9 @@ GenerateModuleInterfaceAction::CreateOutputFile(CompilerInstance &CI, std::unique_ptr GenerateReducedModuleInterfaceAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { - return std::make_unique(CI.getPreprocessor(), - CI.getModuleCache(), - CI.getFrontendOpts().OutputFile); + return 
std::make_unique( + CI.getPreprocessor(), CI.getModuleCache(), + CI.getFrontendOpts().OutputFile, CI.getCodeGenOpts()); } bool GenerateHeaderUnitAction::BeginSourceFileAction(CompilerInstance &CI) { @@ -358,7 +358,8 @@ void VerifyPCHAction::ExecuteAction() { const std::string &Sysroot = CI.getHeaderSearchOpts().Sysroot; std::unique_ptr Reader(new ASTReader( CI.getPreprocessor(), CI.getModuleCache(), &CI.getASTContext(), - CI.getPCHContainerReader(), CI.getFrontendOpts().ModuleFileExtensions, + CI.getPCHContainerReader(), CI.getCodeGenOpts(), + CI.getFrontendOpts().ModuleFileExtensions, Sysroot.empty() ? "" : Sysroot.c_str(), DisableValidationForModuleKind::None, /*AllowASTWithCompilerErrors*/ false, diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index f64613fb4a6cb..382ccd610946c 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -639,16 +639,8 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, } } - if (LangOpts.OpenACC) { - // FIXME: When we have full support for OpenACC, we should set this to the - // version we support. Until then, set as '1' by default, but provide a - // temporary mechanism for users to override this so real-world examples can - // be tested against. - if (!LangOpts.OpenACCMacroOverride.empty()) - Builder.defineMacro("_OPENACC", LangOpts.OpenACCMacroOverride); - else - Builder.defineMacro("_OPENACC", "1"); - } + if (LangOpts.OpenACC) + Builder.defineMacro("_OPENACC", "202506"); } /// Initialize the predefined C++ language feature test macros defined in @@ -779,9 +771,6 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts, if (LangOpts.Char8) Builder.defineMacro("__cpp_char8_t", "202207L"); Builder.defineMacro("__cpp_impl_destroying_delete", "201806L"); - - // TODO: Final number? - Builder.defineMacro("__cpp_type_aware_allocators", "202500L"); } /// InitializeOpenCLFeatureTestMacros - Define OpenCL macros based on target @@ -865,6 +854,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI, const LangOptions &LangOpts, const FrontendOptions &FEOpts, const PreprocessorOptions &PPOpts, + const CodeGenOptions &CGOpts, MacroBuilder &Builder) { // Compiler version introspection macros. 
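
With the InitPreprocessor change above, _OPENACC is pinned to the supported version instead of being user-overridable, so a feature check reduces to:

#if defined(_OPENACC) && _OPENACC >= 202506
// always true under -fopenacc after this change; the macro-override
// escape hatch is gone
#endif
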
Builder.defineMacro("__llvm__"); // LLVM Backend @@ -1034,14 +1024,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.GNUCVersion && LangOpts.RTTI) Builder.defineMacro("__GXX_RTTI"); - if (LangOpts.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) Builder.defineMacro("__USING_SJLJ_EXCEPTIONS__"); - else if (LangOpts.hasSEHExceptions()) + else if (CGOpts.hasSEHExceptions()) Builder.defineMacro("__SEH__"); - else if (LangOpts.hasDWARFExceptions() && + else if (CGOpts.hasDWARFExceptions() && (TI.getTriple().isThumb() || TI.getTriple().isARM())) Builder.defineMacro("__ARM_DWARF_EH__"); - else if (LangOpts.hasWasmExceptions() && TI.getTriple().isWasm()) + else if (CGOpts.hasWasmExceptions() && TI.getTriple().isWasm()) Builder.defineMacro("__WASM_EXCEPTIONS__"); if (LangOpts.Deprecated) @@ -1073,9 +1063,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__clang_wide_literal_encoding__", "\"UTF-16\""); } - if (LangOpts.Optimize) + if (CGOpts.OptimizationLevel != 0) Builder.defineMacro("__OPTIMIZE__"); - if (LangOpts.OptimizeSize) + if (CGOpts.OptimizeSize != 0) Builder.defineMacro("__OPTIMIZE_SIZE__"); if (LangOpts.FastMath) @@ -1396,7 +1386,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.GNUCVersion) addLockFreeMacros("__GCC_ATOMIC_"); - if (LangOpts.NoInlineDefine) + if (CGOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining) Builder.defineMacro("__NO_INLINE__"); if (unsigned PICLevel = LangOpts.PICLevel) { @@ -1575,10 +1565,11 @@ void clang::InitializePreprocessor(Preprocessor &PP, // macros. This is not the right way to handle this. if ((LangOpts.CUDA || LangOpts.isTargetDevice()) && PP.getAuxTargetInfo()) InitializePredefinedMacros(*PP.getAuxTargetInfo(), LangOpts, FEOpts, - PP.getPreprocessorOpts(), Builder); + PP.getPreprocessorOpts(), CodeGenOpts, + Builder); InitializePredefinedMacros(PP.getTargetInfo(), LangOpts, FEOpts, - PP.getPreprocessorOpts(), Builder); + PP.getPreprocessorOpts(), CodeGenOpts, Builder); // Install definitions to make Objective-C++ ARC work well with various // C++ Standard Library implementations. 
diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp index 3f3fe3c9937e4..146cf903a5727 100644 --- a/clang/lib/Frontend/PrecompiledPreamble.cpp +++ b/clang/lib/Frontend/PrecompiledPreamble.cpp @@ -293,8 +293,9 @@ class PrecompilePreambleConsumer : public PCHGenerator { public: PrecompilePreambleConsumer(PrecompilePreambleAction &Action, Preprocessor &PP, ModuleCache &ModCache, StringRef isysroot, - std::shared_ptr Buffer) - : PCHGenerator(PP, ModCache, "", isysroot, std::move(Buffer), + std::shared_ptr Buffer, + const CodeGenOptions &CodeGenOpts) + : PCHGenerator(PP, ModCache, "", isysroot, std::move(Buffer), CodeGenOpts, ArrayRef>(), /*AllowASTWithErrors=*/true), Action(Action) {} @@ -337,7 +338,8 @@ PrecompilePreambleAction::CreateASTConsumer(CompilerInstance &CI, Sysroot.clear(); return std::make_unique( - *this, CI.getPreprocessor(), CI.getModuleCache(), Sysroot, Buffer); + *this, CI.getPreprocessor(), CI.getModuleCache(), Sysroot, Buffer, + CI.getCodeGenOpts()); } template bool moveOnNoError(llvm::ErrorOr Val, T &Output) { diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index c96d209c1fc0c..dd52498bbef4c 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -127,6 +127,7 @@ set(riscv_files riscv_bitmanip.h riscv_corev_alu.h riscv_crypto.h + riscv_nds.h riscv_ntlh.h sifive_vector.h andes_vector.h @@ -347,6 +348,10 @@ set(cuda_wrapper_bits_files cuda_wrappers/bits/basic_string.tcc ) +set(cuda_wrapper_utility_files + cuda_wrappers/__utility/declval.h +) + set(ppc_wrapper_files ppc_wrappers/mmintrin.h ppc_wrappers/xmmintrin.h @@ -443,8 +448,9 @@ endfunction(clang_generate_header) # Copy header files from the source directory to the build directory foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files} - ${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files} - ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files}) + ${cuda_wrapper_utility_files} ${ppc_wrapper_files} ${openmp_wrapper_files} + ${zos_wrapper_files} ${hlsl_files} ${llvm_libc_wrapper_files} + ${llvm_offload_wrapper_files}) copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f}) endforeach( f ) @@ -553,7 +559,7 @@ add_header_target("arm-common-resource-headers" "${arm_common_files};${arm_commo # Architecture/platform specific targets add_header_target("arm-resource-headers" "${arm_only_files};${arm_only_generated_files}") add_header_target("aarch64-resource-headers" "${aarch64_only_files};${aarch64_only_generated_files}") -add_header_target("cuda-resource-headers" "${cuda_files};${cuda_wrapper_files};${cuda_wrapper_bits_files}") +add_header_target("cuda-resource-headers" "${cuda_files};${cuda_wrapper_files};${cuda_wrapper_bits_files};${cuda_wrapper_utility_files}") add_header_target("hexagon-resource-headers" "${hexagon_files}") add_header_target("hip-resource-headers" "${hip_files}") add_header_target("loongarch-resource-headers" "${loongarch_files}") @@ -600,6 +606,11 @@ install( DESTINATION ${header_install_dir}/cuda_wrappers/bits COMPONENT clang-resource-headers) +install( + FILES ${cuda_wrapper_utility_files} + DESTINATION ${header_install_dir}/cuda_wrappers/__utility + COMPONENT clang-resource-headers) + install( FILES ${ppc_wrapper_files} DESTINATION ${header_install_dir}/ppc_wrappers @@ -663,6 +674,12 @@ install( EXCLUDE_FROM_ALL COMPONENT cuda-resource-headers) +install( + FILES ${cuda_wrapper_utility_files} + DESTINATION 
${header_install_dir}/cuda_wrappers/__utility + EXCLUDE_FROM_ALL + COMPONENT cuda-resource-headers) + install( FILES ${cuda_files} DESTINATION ${header_install_dir} diff --git a/clang/lib/Headers/__clang_spirv_builtins.h b/clang/lib/Headers/__clang_spirv_builtins.h index 9915cdfcae7cd..9c7215f506508 100644 --- a/clang/lib/Headers/__clang_spirv_builtins.h +++ b/clang/lib/Headers/__clang_spirv_builtins.h @@ -52,30 +52,30 @@ // Builtin IDs and sizes extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_num_workgroups) __size_t - __spirv_NumWorkgroups(int); + __spirv_BuiltInNumWorkgroups(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_workgroup_size) __size_t - __spirv_WorkgroupSize(int); + __spirv_BuiltInWorkgroupSize(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_workgroup_id) __size_t - __spirv_WorkgroupId(int); + __spirv_BuiltInWorkgroupId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_local_invocation_id) __size_t - __spirv_LocalInvocationId(int); + __spirv_BuiltInLocalInvocationId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_invocation_id) __size_t - __spirv_GlobalInvocationId(int); + __spirv_BuiltInGlobalInvocationId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_size) __size_t - __spirv_GlobalSize(int); + __spirv_BuiltInGlobalSize(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_offset) __size_t - __spirv_GlobalOffset(int); + __spirv_BuiltInGlobalOffset(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_size) __uint32_t - __spirv_SubgroupSize(); + __spirv_BuiltInSubgroupSize(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_max_size) __uint32_t - __spirv_SubgroupMaxSize(); + __spirv_BuiltInSubgroupMaxSize(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_num_subgroups) __uint32_t - __spirv_NumSubgroups(); + __spirv_BuiltInNumSubgroups(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_id) __uint32_t - __spirv_SubgroupId(); + __spirv_BuiltInSubgroupId(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_local_invocation_id) - __uint32_t __spirv_SubgroupLocalInvocationId(); + __uint32_t __spirv_BuiltInSubgroupLocalInvocationId(); // OpGenericCastToPtrExplicit diff --git a/clang/lib/Headers/cuda_wrappers/__utility/declval.h b/clang/lib/Headers/cuda_wrappers/__utility/declval.h new file mode 100644 index 0000000000000..a8434e3601939 --- /dev/null +++ b/clang/lib/Headers/cuda_wrappers/__utility/declval.h @@ -0,0 +1,28 @@ +#ifndef __CUDA_WRAPPERS_UTILITY_DECLVAL_H__ +#define __CUDA_WRAPPERS_UTILITY_DECLVAL_H__ + +#include_next <__utility/declval.h> + +// The stuff below is the exact copy of the <__utility/declval.h>, +// but with __device__ attribute applied to the functions, so it works on a GPU. + +_LIBCPP_BEGIN_NAMESPACE_STD + +// Suppress deprecation notice for volatile-qualified return type resulting +// from volatile-qualified types _Tp. +_LIBCPP_SUPPRESS_DEPRECATED_PUSH +template __attribute__((device)) _Tp &&__declval(int); +template __attribute__((device)) _Tp __declval(long); +_LIBCPP_SUPPRESS_DEPRECATED_POP + +template +__attribute__((device)) _LIBCPP_HIDE_FROM_ABI decltype(std::__declval<_Tp>(0)) +declval() _NOEXCEPT { + static_assert(!__is_same(_Tp, _Tp), + "std::declval can only be used in an unevaluated context. 
" + "It's likely that your current usage is trying to extract a " + "value from the function."); +} + +_LIBCPP_END_NAMESPACE_STD +#endif // __CUDA_WRAPPERS_UTILITY_DECLVAL_H__ diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h index 4eb7b8f45c85a..e8ccccb489815 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h @@ -71,6 +71,16 @@ constexpr vector reflect_vec_impl(vector I, vector N) { #endif } +template constexpr T refract_impl(T I, T N, U Eta) { +#if (__has_builtin(__builtin_spirv_refract)) + return __builtin_spirv_refract(I, N, Eta); +#endif + T Mul = dot(N, I); + T K = 1 - Eta * Eta * (1 - Mul * Mul); + T Result = (Eta * I - (Eta * Mul + sqrt(K)) * N); + return select(K < 0, static_cast(0), Result); +} + template constexpr T fmod_impl(T X, T Y) { #if !defined(__DIRECTX__) return __builtin_elementwise_fmod(X, Y); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index ea880105fac3b..499a05328ee4f 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -475,6 +475,65 @@ reflect(__detail::HLSL_FIXED_VECTOR I, return __detail::reflect_vec_impl(I, N); } +//===----------------------------------------------------------------------===// +// refract builtin +//===----------------------------------------------------------------------===// + +/// \fn T refract(T I, T N, T eta) +/// \brief Returns a refraction using an entering ray, \a I, a surface +/// normal, \a N and refraction index \a eta +/// \param I The entering ray. +/// \param N The surface normal. +/// \param eta The refraction index. +/// +/// The return value is a floating-point vector that represents the refraction +/// using the refraction index, \a eta, for the direction of the entering ray, +/// \a I, off a surface with the normal \a N. +/// +/// This function calculates the refraction vector using the following formulas: +/// k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I)) +/// if k < 0.0 the result is 0.0 +/// otherwise, the result is eta * I - (eta * dot(N, I) + sqrt(k)) * N +/// +/// I and N must already be normalized in order to achieve the desired result. +/// +/// I and N must be a scalar or vector whose component type is +/// floating-point. +/// +/// eta must be a 16-bit or 32-bit floating-point scalar. +/// +/// Result type, the type of I, and the type of N must all be the same type. 
+ +template +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline __detail::enable_if_t<__detail::is_arithmetic::Value && + __detail::is_same::value, + T> refract(T I, T N, T eta) { + return __detail::refract_impl(I, N, eta); +} + +template +const inline __detail::enable_if_t< + __detail::is_arithmetic::Value && __detail::is_same::value, T> +refract(T I, T N, T eta) { + return __detail::refract_impl(I, N, eta); +} + +template +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline __detail::HLSL_FIXED_VECTOR refract( + __detail::HLSL_FIXED_VECTOR I, + __detail::HLSL_FIXED_VECTOR N, half eta) { + return __detail::refract_impl(I, N, eta); +} + +template +const inline __detail::HLSL_FIXED_VECTOR +refract(__detail::HLSL_FIXED_VECTOR I, + __detail::HLSL_FIXED_VECTOR N, float eta) { + return __detail::refract_impl(I, N, eta); +} + //===----------------------------------------------------------------------===// // smoothstep builtin //===----------------------------------------------------------------------===// diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h index d489a67c533d4..7f7d387cbdfda 100644 --- a/clang/lib/Headers/ptrauth.h +++ b/clang/lib/Headers/ptrauth.h @@ -42,6 +42,19 @@ typedef enum { The extra data is always 0. */ ptrauth_key_cxx_vtable_pointer = ptrauth_key_process_independent_data, + /* The key used to sign metadata pointers to Objective-C method-lists. */ + ptrauth_key_method_list_pointer = ptrauth_key_asda, + + /* The key used to sign Objective-C isa and super pointers. */ + ptrauth_key_objc_isa_pointer = ptrauth_key_process_independent_data, + ptrauth_key_objc_super_pointer = ptrauth_key_process_independent_data, + + /* The key used to sign selector pointers */ + ptrauth_key_objc_sel_pointer = ptrauth_key_process_dependent_data, + + /* The key used to sign Objective-C class_ro_t pointers. */ + ptrauth_key_objc_class_ro_pointer = ptrauth_key_process_independent_data, + /* The key used to sign pointers in ELF .init_array/.fini_array. 
*/ ptrauth_key_init_fini_pointer = ptrauth_key_process_independent_code, @@ -259,6 +272,46 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; /* The value is ptrauth_string_discriminator("init_fini") */ #define __ptrauth_init_fini_discriminator 0xd9d4 +/* Objective-C pointer auth ABI qualifiers */ +#define __ptrauth_objc_method_list_imp \ + __ptrauth(ptrauth_key_function_pointer, 1, 0) + +#if __has_feature(ptrauth_objc_method_list_pointer) +#define __ptrauth_objc_method_list_pointer \ + __ptrauth(ptrauth_key_method_list_pointer, 1, 0xC310) +#else +#define __ptrauth_objc_method_list_pointer +#endif + +#define __ptrauth_isa_discriminator 0x6AE1 +#define __ptrauth_super_discriminator 0xB5AB +#define __ptrauth_objc_isa_pointer \ + __ptrauth(ptrauth_key_objc_isa_pointer, 1, __ptrauth_isa_discriminator) +#if __has_feature(ptrauth_restricted_intptr_qualifier) +#define __ptrauth_objc_isa_uintptr \ + __ptrauth_restricted_intptr(ptrauth_key_objc_isa_pointer, 1, \ + __ptrauth_isa_discriminator) +#else +#define __ptrauth_objc_isa_uintptr \ + __ptrauth(ptrauth_key_objc_isa_pointer, 1, __ptrauth_isa_discriminator) +#endif + +#define __ptrauth_objc_super_pointer \ + __ptrauth(ptrauth_key_objc_super_pointer, 1, __ptrauth_super_discriminator) + +#define __ptrauth_objc_sel_discriminator 0x57c2 +#if __has_feature(ptrauth_objc_interface_sel) +#define __ptrauth_objc_sel \ + __ptrauth(ptrauth_key_objc_sel_pointer, 1, __ptrauth_objc_sel_discriminator) +#else +#define __ptrauth_objc_sel +#endif + +#define __ptrauth_objc_class_ro_discriminator 0x61f8 +#define __ptrauth_objc_class_ro \ + __ptrauth(ptrauth_key_objc_class_ro_pointer, 1, \ + __ptrauth_objc_class_ro_discriminator) + #else #define ptrauth_strip(__value, __key) \ @@ -331,6 +384,10 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; #define ptrauth_cxx_vtable_pointer(key, address_discrimination, \ extra_discrimination...) +#define __ptrauth_objc_isa_pointer +#define __ptrauth_objc_isa_uintptr +#define __ptrauth_objc_super_pointer + #endif /* __has_feature(ptrauth_intrinsics) */ #endif /* __PTRAUTH_H */ diff --git a/clang/lib/Headers/riscv_nds.h b/clang/lib/Headers/riscv_nds.h new file mode 100644 index 0000000000000..5ccef00e332ed --- /dev/null +++ b/clang/lib/Headers/riscv_nds.h @@ -0,0 +1,35 @@ +/*===---- riscv_nds.h - Andes intrinsics -----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
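
As a usage sketch for the qualifiers defined in the ptrauth.h hunk above (the struct is hypothetical; the qualifier only takes effect where pointer authentication is enabled, e.g. arm64e, and expands to nothing in the fallback branch, so this compiles everywhere ptrauth.h exists):

#include <ptrauth.h>

struct toy_class;
struct toy_object {
  // Signed with ptrauth_key_objc_isa_pointer, address-discriminated,
  // blended with __ptrauth_isa_discriminator (0x6AE1).
  struct toy_class *__ptrauth_objc_isa_pointer isa;
};
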
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __RISCV_NDS_H +#define __RISCV_NDS_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__riscv_xandesbfhcvt) + +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) + +static __inline__ float __DEFAULT_FN_ATTRS __riscv_nds_fcvt_s_bf16(__bf16 bf) { + return __builtin_riscv_nds_fcvt_s_bf16(bf); +} + +static __inline__ __bf16 __DEFAULT_FN_ATTRS __riscv_nds_fcvt_bf16_s(float sf) { + return __builtin_riscv_nds_fcvt_bf16_s(sf); +} + +#endif // defined(__riscv_xandesbfhcvt) + +#if defined(__cplusplus) +} +#endif + +#endif // define __RISCV_NDS_H diff --git a/clang/lib/Index/IndexTypeSourceInfo.cpp b/clang/lib/Index/IndexTypeSourceInfo.cpp index 98b5513128fbe..adc33b3abd822 100644 --- a/clang/lib/Index/IndexTypeSourceInfo.cpp +++ b/clang/lib/Index/IndexTypeSourceInfo.cpp @@ -271,10 +271,6 @@ void IndexingContext::indexNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS, handleReference(NNS.getNestedNameSpecifier()->getAsNamespace(), Loc, Parent, DC, SymbolRoleSet()); break; - case NestedNameSpecifier::NamespaceAlias: - handleReference(NNS.getNestedNameSpecifier()->getAsNamespaceAlias(), - Loc, Parent, DC, SymbolRoleSet()); - break; case NestedNameSpecifier::TypeSpec: indexTypeLoc(NNS.getTypeLoc(), Parent, DC); diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp index 3e22b4001bde7..3e5f947a82cc3 100644 --- a/clang/lib/Lex/InitHeaderSearch.cpp +++ b/clang/lib/Lex/InitHeaderSearch.cpp @@ -222,7 +222,6 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths( case llvm::Triple::Linux: case llvm::Triple::LiteOS: case llvm::Triple::Managarm: - case llvm::Triple::NaCl: case llvm::Triple::NetBSD: case llvm::Triple::OpenBSD: case llvm::Triple::PS4: diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 9fe18b0dbcedb..31392d1dd8d4b 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -591,8 +591,7 @@ bool Parser::ParseUsingDeclarator(DeclaratorContext Context, NextToken().isRegularKeywordAttribute() || NextToken().is(tok::kw___attribute)) && D.SS.isNotEmpty() && LastII == Tok.getIdentifierInfo() && - !D.SS.getScopeRep()->getAsNamespace() && - !D.SS.getScopeRep()->getAsNamespaceAlias()) { + D.SS.getScopeRep()->getKind() != NestedNameSpecifier::Namespace) { SourceLocation IdLoc = ConsumeToken(); ParsedType Type = Actions.getInheritingConstructorName(D.SS, IdLoc, *LastII); @@ -1133,13 +1132,6 @@ void Parser::AnnotateExistingDecltypeSpecifier(const DeclSpec &DS, // make sure we have a token we can turn into an annotation token if (PP.isBacktrackEnabled()) { PP.RevertCachedTokens(1); - if (DS.getTypeSpecType() == TST_error) { - // We encountered an error in parsing 'decltype(...)' so lets annotate all - // the tokens in the backtracking cache - that we likely had to skip over - // to get to a token that allows us to resume parsing, such as a - // semi-colon. 
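The riscv_nds.h header added above gates its wrappers on __riscv_xandesbfhcvt. As a minimal usage sketch (the function below and its values are invented for illustration, and a toolchain targeting a RISC-V core with the XAndesBFHCvt extension is assumed):

#include <riscv_nds.h>

// Round-trips a bfloat16 value through single precision using the two
// intrinsics defined by the header; requires __riscv_xandesbfhcvt.
float bf16_round_trip(__bf16 In) {
  float Widened = __riscv_nds_fcvt_s_bf16(In);        // bf16 -> float
  __bf16 Narrowed = __riscv_nds_fcvt_bf16_s(Widened); // float -> bf16
  return __riscv_nds_fcvt_s_bf16(Narrowed);
}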
-      EndLoc = PP.getLastCachedTokenLocation();
-    }
   } else
     PP.EnterToken(Tok, /*IsReinject*/ true);
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index a4e5519f13d18..d1400cbfc884d 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -37,6 +37,7 @@
 #include "clang/Analysis/AnalysisDeclContext.h"
 #include "clang/Analysis/CFG.h"
 #include "clang/Analysis/CFGStmtMap.h"
+#include "clang/Analysis/FlowSensitive/DataflowWorklist.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/SourceLocation.h"
@@ -46,6 +47,7 @@
 #include "clang/Sema/SemaInternal.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -401,6 +403,142 @@ static bool isNoexcept(const FunctionDecl *FD) {
   return false;
 }
 
+/// Checks if the given expression is a reference to a function with the
+/// 'noreturn' attribute.
+static bool isReferenceToNoReturn(const Expr *E) {
+  if (auto *DRef = dyn_cast<DeclRefExpr>(E->IgnoreParenCasts()))
+    if (auto *FD = dyn_cast<FunctionDecl>(DRef->getDecl()))
+      return FD->isNoReturn();
+  return false;
+}
+
+/// Checks if the given variable, which is assumed to be a function pointer, is
+/// initialized with a function having the 'noreturn' attribute.
+static bool isInitializedWithNoReturn(const VarDecl *VD) {
+  if (const Expr *Init = VD->getInit()) {
+    if (auto *ListInit = dyn_cast<InitListExpr>(Init);
+        ListInit && ListInit->getNumInits() > 0)
+      Init = ListInit->getInit(0);
+    return isReferenceToNoReturn(Init);
+  }
+  return false;
+}
+
+namespace {
+
+/// Looks for statements that can define the value of the given variable.
+struct TransferFunctions : public StmtVisitor<TransferFunctions> {
+  const VarDecl *Var;
+  std::optional<bool> AllValuesAreNoReturn;
+
+  TransferFunctions(const VarDecl *VD) : Var(VD) {}
+
+  void reset() { AllValuesAreNoReturn = std::nullopt; }
+
+  void VisitDeclStmt(DeclStmt *DS) {
+    for (auto *DI : DS->decls())
+      if (auto *VD = dyn_cast<VarDecl>(DI))
+        if (VarDecl *Def = VD->getDefinition())
+          if (Def == Var)
+            AllValuesAreNoReturn = isInitializedWithNoReturn(Def);
+  }
+
+  void VisitUnaryOperator(UnaryOperator *UO) {
+    if (UO->getOpcode() == UO_AddrOf) {
+      if (auto *DRef =
+              dyn_cast<DeclRefExpr>(UO->getSubExpr()->IgnoreParenCasts()))
+        if (DRef->getDecl() == Var)
+          AllValuesAreNoReturn = false;
+    }
+  }
+
+  void VisitBinaryOperator(BinaryOperator *BO) {
+    if (BO->getOpcode() == BO_Assign)
+      if (auto *DRef = dyn_cast<DeclRefExpr>(BO->getLHS()->IgnoreParenCasts()))
+        if (DRef->getDecl() == Var)
+          AllValuesAreNoReturn = isReferenceToNoReturn(BO->getRHS());
+  }
+
+  void VisitCallExpr(CallExpr *CE) {
+    for (CallExpr::arg_iterator I = CE->arg_begin(), E = CE->arg_end(); I != E;
+         ++I) {
+      const Expr *Arg = *I;
+      if (Arg->isGLValue() && !Arg->getType().isConstQualified())
+        if (auto *DRef = dyn_cast<DeclRefExpr>(Arg->IgnoreParenCasts()))
+          if (auto *VD = dyn_cast<VarDecl>(DRef->getDecl()))
+            if (VD->getDefinition() == Var)
+              AllValuesAreNoReturn = false;
+    }
+  }
+};
+} // namespace
+
+// Checks if all possible values of the given variable are functions with the
+// 'noreturn' attribute.
+static bool areAllValuesNoReturn(const VarDecl *VD, const CFGBlock &VarBlk,
+                                 AnalysisDeclContext &AC) {
+  // The set of possible values of a constant variable is determined by
+  // its initializer, unless it is a function parameter.
+  if (!isa<ParmVarDecl>(VD) && VD->getType().isConstant(AC.getASTContext())) {
+    if (const VarDecl *Def = VD->getDefinition())
+      return isInitializedWithNoReturn(Def);
+    return false;
+  }
+
+  // In a multithreaded environment the value of a global variable may be
+  // changed asynchronously.
+  if (!VD->getDeclContext()->isFunctionOrMethod())
+    return false;
+
+  // Check the condition "all values are noreturn". It is satisfied if the
+  // variable is set to a "noreturn" value in the current block or all of its
+  // predecessors satisfy the condition.
+  using MapTy = llvm::DenseMap<const CFGBlock *, std::optional<bool>>;
+  using ValueTy = MapTy::value_type;
+  MapTy BlocksToCheck;
+  BlocksToCheck[&VarBlk] = std::nullopt;
+  const auto BlockSatisfiesCondition = [](ValueTy Item) {
+    return Item.getSecond().value_or(false);
+  };
+
+  TransferFunctions TF(VD);
+  BackwardDataflowWorklist Worklist(*AC.getCFG(), AC);
+  Worklist.enqueueBlock(&VarBlk);
+  while (const CFGBlock *B = Worklist.dequeue()) {
+    // First check the current block.
+    for (CFGBlock::const_reverse_iterator ri = B->rbegin(), re = B->rend();
+         ri != re; ++ri) {
+      if (std::optional<CFGStmt> cs = ri->getAs<CFGStmt>()) {
+        const Stmt *S = cs->getStmt();
+        TF.reset();
+        TF.Visit(const_cast<Stmt *>(S));
+        if (TF.AllValuesAreNoReturn) {
+          if (!TF.AllValuesAreNoReturn.value())
+            return false;
+          BlocksToCheck[B] = true;
+          break;
+        }
+      }
+    }
+
+    // If all checked blocks satisfy the condition, the check is finished.
+    if (llvm::all_of(BlocksToCheck, BlockSatisfiesCondition))
+      return true;
+
+    // If this block does not contain the variable definition, check
+    // its predecessors.
+    if (!BlocksToCheck[B]) {
+      Worklist.enqueuePredecessors(B);
+      BlocksToCheck.erase(B);
+      for (const auto &PredBlk : B->preds())
+        if (!BlocksToCheck.contains(PredBlk))
+          BlocksToCheck[PredBlk] = std::nullopt;
+    }
+  }
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Check for missing return value.
 //===----------------------------------------------------------------------===//
@@ -527,6 +665,17 @@ static ControlFlowKind CheckFallThrough(AnalysisDeclContext &AC) {
       HasAbnormalEdge = true;
       continue;
     }
+    if (auto *Call = dyn_cast<CallExpr>(S)) {
+      const Expr *Callee = Call->getCallee();
+      if (Callee->getType()->isPointerType())
+        if (auto *DeclRef =
+                dyn_cast<DeclRefExpr>(Callee->IgnoreParenImpCasts()))
+          if (auto *VD = dyn_cast<VarDecl>(DeclRef->getDecl()))
+            if (areAllValuesNoReturn(VD, B, AC)) {
+              HasAbnormalEdge = true;
+              continue;
+            }
+    }
 
     HasPlainEdge = true;
   }
@@ -994,6 +1143,14 @@ static bool DiagnoseUninitializedConstRefUse(Sema &S, const VarDecl *VD,
   return !S.getDiagnostics().isLastDiagnosticIgnored();
 }
 
+/// Diagnose uninitialized const pointer usages.
+static bool DiagnoseUninitializedConstPtrUse(Sema &S, const VarDecl *VD,
+                                             const UninitUse &Use) {
+  S.Diag(Use.getUser()->getBeginLoc(), diag::warn_uninit_const_pointer)
+      << VD->getDeclName() << Use.getUser()->getSourceRange();
+  return !S.getDiagnostics().isLastDiagnosticIgnored();
+}
+
 /// DiagnoseUninitializedUse -- Helper function for diagnosing uses of an
 /// uninitialized variable. This manages the different forms of diagnostic
 /// emitted for particular types of uses. Returns true if the use was diagnosed
@@ -1599,9 +1756,9 @@ class UninitValsDiagReporter : public UninitVariablesHandler {
     // a stable ordering.
     llvm::sort(*vec, [](const UninitUse &a, const UninitUse &b) {
       // Prefer the direct use of an uninitialized variable over its use via
-      // constant reference.
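In source terms, the new analysis lets -Wreturn-type treat a call through a function pointer as abnormal control flow when every value the pointer can hold is a 'noreturn' function. A sketch of code that should now pass without a missing-return warning:

[[noreturn]] void fatal_error();

int divide(int Num, int Den) {
  // P is const and initialized with a noreturn function, so the analysis
  // above concludes that the trailing call never falls through.
  void (*const P)() = fatal_error;
  if (Den != 0)
    return Num / Den;
  P(); // behaves like a direct call to a noreturn function
}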
- if (a.isConstRefUse() != b.isConstRefUse()) - return b.isConstRefUse(); + // constant reference or pointer. + if (a.isConstRefOrPtrUse() != b.isConstRefOrPtrUse()) + return b.isConstRefOrPtrUse(); // Prefer a more confident report over a less confident one. if (a.getKind() != b.getKind()) return a.getKind() > b.getKind(); @@ -1612,6 +1769,9 @@ class UninitValsDiagReporter : public UninitVariablesHandler { if (U.isConstRefUse()) { if (DiagnoseUninitializedConstRefUse(S, vd, U)) return; + } else if (U.isConstPtrUse()) { + if (DiagnoseUninitializedConstPtrUse(S, vd, U)) + return; } else { // If we have self-init, downgrade all uses to 'may be uninitialized'. UninitUse Use = hasSelfInit ? UninitUse(U.getUser(), false) : U; @@ -1951,11 +2111,26 @@ class ThreadSafetyReporter : public clang::threadSafety::ThreadSafetyHandler { void handleNoMutexHeld(const NamedDecl *D, ProtectedOperationKind POK, AccessKind AK, SourceLocation Loc) override { - assert((POK == POK_VarAccess || POK == POK_VarDereference) && - "Only works for variables"); - unsigned DiagID = POK == POK_VarAccess? - diag::warn_variable_requires_any_lock: - diag::warn_var_deref_requires_any_lock; + unsigned DiagID = 0; + switch (POK) { + case POK_VarAccess: + case POK_PassByRef: + case POK_ReturnByRef: + case POK_PassPointer: + case POK_ReturnPointer: + DiagID = diag::warn_variable_requires_any_lock; + break; + case POK_VarDereference: + case POK_PtPassByRef: + case POK_PtReturnByRef: + case POK_PtPassPointer: + case POK_PtReturnPointer: + DiagID = diag::warn_var_deref_requires_any_lock; + break; + case POK_FunctionCall: + llvm_unreachable("Only works for variables"); + break; + } PartialDiagnosticAt Warning(Loc, S.PDiag(DiagID) << D << getLockKindFromAccessKind(AK)); Warnings.emplace_back(std::move(Warning), getNotes()); @@ -2828,7 +3003,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( if (!Diags.isIgnored(diag::warn_uninit_var, D->getBeginLoc()) || !Diags.isIgnored(diag::warn_sometimes_uninit_var, D->getBeginLoc()) || !Diags.isIgnored(diag::warn_maybe_uninit_var, D->getBeginLoc()) || - !Diags.isIgnored(diag::warn_uninit_const_reference, D->getBeginLoc())) { + !Diags.isIgnored(diag::warn_uninit_const_reference, D->getBeginLoc()) || + !Diags.isIgnored(diag::warn_uninit_const_pointer, D->getBeginLoc())) { if (CFG *cfg = AC.getCFG()) { UninitValsDiagReporter reporter(S); UninitVariablesAnalysisStats stats; diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp index ee5a862c32509..f0f1d66f66e93 100644 --- a/clang/lib/Sema/DeclSpec.cpp +++ b/clang/lib/Sema/DeclSpec.cpp @@ -72,7 +72,7 @@ void CXXScopeSpec::Extend(ASTContext &Context, IdentifierInfo *Identifier, "NestedNameSpecifierLoc range computation incorrect"); } -void CXXScopeSpec::Extend(ASTContext &Context, NamespaceDecl *Namespace, +void CXXScopeSpec::Extend(ASTContext &Context, NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc) { Builder.Extend(Context, Namespace, NamespaceLoc, ColonColonLoc); @@ -85,19 +85,6 @@ void CXXScopeSpec::Extend(ASTContext &Context, NamespaceDecl *Namespace, "NestedNameSpecifierLoc range computation incorrect"); } -void CXXScopeSpec::Extend(ASTContext &Context, NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, - SourceLocation ColonColonLoc) { - Builder.Extend(Context, Alias, AliasLoc, ColonColonLoc); - - if (Range.getBegin().isInvalid()) - Range.setBegin(AliasLoc); - Range.setEnd(ColonColonLoc); - - assert(Range == Builder.getSourceRange() && - "NestedNameSpecifierLoc range 
computation incorrect"); -} - void CXXScopeSpec::MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc) { Builder.MakeGlobal(Context, ColonColonLoc); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index e6414a623b929..c23c98aa3aaeb 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds: + case AMDGPU::BI__builtin_amdgcn_struct_ptr_buffer_load_lds: case AMDGPU::BI__builtin_amdgcn_load_to_lds: case AMDGPU::BI__builtin_amdgcn_global_load_lds: { constexpr const int SizeIdx = 2; diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp index ab83f625d2849..6ac04837708f6 100644 --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp @@ -137,10 +137,7 @@ DeclContext *Sema::computeDeclContext(const CXXScopeSpec &SS, llvm_unreachable("Dependent nested-name-specifier has no DeclContext"); case NestedNameSpecifier::Namespace: - return NNS->getAsNamespace(); - - case NestedNameSpecifier::NamespaceAlias: - return NNS->getAsNamespaceAlias()->getNamespace(); + return NNS->getAsNamespace()->getNamespace(); case NestedNameSpecifier::TypeSpec: { const TagType *Tag = NNS->getAsType()->getAs(); @@ -992,7 +989,6 @@ bool Sema::ShouldEnterDeclaratorScope(Scope *S, const CXXScopeSpec &SS) { switch (Qualifier->getKind()) { case NestedNameSpecifier::Global: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: // These are always namespace scopes. We never want to enter a // namespace scope from anything but a file context. return CurContext->getRedeclContext()->isFileContext(); diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 834417f8e15ac..5205ca0bca6fa 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -925,7 +925,12 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( ND && ND->isFunctionOrFunctionTemplate()) { ScopeForParameters.emplace(S, /*CombineWithOuterScope=*/true); const FunctionDecl *FD = ND->getAsFunction(); + if (FunctionTemplateDecl *Template = FD->getDescribedFunctionTemplate(); + Template && Template->getInstantiatedFromMemberTemplate()) + FD = Template->getInstantiatedFromMemberTemplate()->getTemplatedDecl(); for (auto *PVD : FD->parameters()) { + if (ScopeForParameters->getInstantiationOfIfExists(PVD)) + continue; if (!PVD->isParameterPack()) { ScopeForParameters->InstantiatedLocal(PVD, PVD); continue; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index d7234e269f645..14403e65e8f42 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -8059,7 +8059,7 @@ NamedDecl *Sema::ActOnVariableDeclarator( NewVD->setInvalidDecl(); // Handle GNU asm-label extension (encoded as an attribute). - if (Expr *E = (Expr*)D.getAsmLabel()) { + if (Expr *E = D.getAsmLabel()) { // The parser guarantees this is a string. StringLiteral *SE = cast(E); StringRef Label = SE->getString(); @@ -10333,7 +10333,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, isFunctionTemplateSpecialization); // Handle GNU asm-label extension (encoded as an attribute). - if (Expr *E = (Expr*) D.getAsmLabel()) { + if (Expr *E = D.getAsmLabel()) { // The parser guarantees this is a string. 
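For context, the asm-label extension both of the cleaned-up call sites handle looks like this; the literal becomes the symbol name verbatim, which is why the parser can guarantee a string:

// GNU asm-label extension: attach explicit symbol names, bypassing mangling.
extern int Counter asm("counter_v2");
int AddOne(int X) asm("add_one_impl");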
StringLiteral *SE = cast(E); NewFD->addAttr(AsmLabelAttr::Create(Context, SE->getString(), diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 5f481ed1f7139..78f4804202ddc 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6247,7 +6247,7 @@ static void handleInterruptAttr(Sema &S, Decl *D, const ParsedAttr &AL) { static void handleLayoutVersion(Sema &S, Decl *D, const ParsedAttr &AL) { uint32_t Version; - Expr *VersionExpr = static_cast(AL.getArgAsExpr(0)); + Expr *VersionExpr = AL.getArgAsExpr(0); if (!S.checkUInt32Argument(AL, AL.getArgAsExpr(0), Version)) return; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 5cc92ebb0171f..f5b4614576086 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -11930,7 +11930,7 @@ Decl *Sema::ActOnStartNamespaceDef(Scope *NamespcScope, /// getNamespaceDecl - Returns the namespace a decl represents. If the decl /// is a namespace alias, returns the namespace it points to. -static inline NamespaceDecl *getNamespaceDecl(NamedDecl *D) { +static inline NamespaceDecl *getNamespaceDecl(NamespaceBaseDecl *D) { if (NamespaceAliasDecl *AD = dyn_cast_or_null(D)) return AD->getNamespace(); return dyn_cast_or_null(D); @@ -13829,7 +13829,7 @@ Decl *Sema::ActOnNamespaceAliasDef(Scope *S, SourceLocation NamespaceLoc, } } assert(!R.isAmbiguous() && !R.empty()); - NamedDecl *ND = R.getRepresentativeDecl(); + auto *ND = cast(R.getRepresentativeDecl()); // Check if we have a previous declaration with the same name. LookupResult PrevR(*this, Alias, AliasLoc, LookupOrdinaryName, @@ -16541,12 +16541,8 @@ static inline bool CheckOperatorNewDeleteTypes( if (IsPotentiallyTypeAware) { // We don't emit this diagnosis for template instantiations as we will // have already emitted it for the original template declaration. - if (!FnDecl->isTemplateInstantiation()) { - unsigned DiagID = SemaRef.getLangOpts().CPlusPlus26 - ? diag::warn_cxx26_type_aware_allocators - : diag::ext_cxx26_type_aware_allocators; - SemaRef.Diag(FnDecl->getLocation(), DiagID); - } + if (!FnDecl->isTemplateInstantiation()) + SemaRef.Diag(FnDecl->getLocation(), diag::warn_ext_type_aware_allocators); if (OperatorKind == AllocationOperatorKind::New) { SizeParameterIndex = 1; @@ -18686,7 +18682,7 @@ bool Sema::CheckOverridingFunctionAttributes(CXXMethodDecl *New, case FunctionEffectDiff::OverrideResult::NoAction: break; case FunctionEffectDiff::OverrideResult::Warn: - Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) + Diag(New->getLocation(), diag::warn_conflicting_func_effect_override) << Diff.effectName(); Diag(Old->getLocation(), diag::note_overridden_virtual_function) << Old->getReturnTypeSourceRange(); @@ -18699,6 +18695,14 @@ bool Sema::CheckOverridingFunctionAttributes(CXXMethodDecl *New, QualType ModQT = Context.getFunctionType(NewFT->getReturnType(), NewFT->getParamTypes(), EPI); New->setType(ModQT); + if (Errs.empty()) { + // A warning here is somewhat pedantic. Skip this if there was + // already a merge conflict, which is more serious. 
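A sketch of the override shape the merge path above concerns, assuming the nonblocking function effect; whether a given mismatch warns, merges, or errors depends on the specific effect and the direction of the difference:

struct Base {
  virtual void step() [[clang::nonblocking]];
};

struct Derived : Base {
  // The overrider does not restate the effect; on the merge path the
  // effect is folded into this function's type, now with a warning here.
  void step() override;
};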
+ Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) + << Diff.effectName(); + Diag(Old->getLocation(), diag::note_overridden_virtual_function) + << Old->getReturnTypeSourceRange(); + } break; } } diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index 9dbc3efbca405..bbd104909956f 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -5585,6 +5585,14 @@ Decl *SemaObjC::ActOnIvar(Scope *S, SourceLocation DeclStart, Declarator &D, TypeSourceInfo *TInfo = SemaRef.GetTypeForDeclarator(D); QualType T = TInfo->getType(); + ASTContext &Context = getASTContext(); + if (Context.getLangOpts().PointerAuthObjcInterfaceSel && + !T.getPointerAuth()) { + if (Context.isObjCSelType(T.getUnqualifiedType())) { + if (auto PAQ = Context.getObjCMemberSelTypePtrAuth()) + T = Context.getPointerAuthType(T, PAQ); + } + } if (BitWidth) { // 6.7.2.1p3, 6.7.2.1p4 diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 25afa2f4dfe7a..fd95f4ec54229 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -518,7 +518,6 @@ bool Sema::checkLiteralOperatorId(const CXXScopeSpec &SS, case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: return false; } @@ -3477,8 +3476,8 @@ void Sema::DeclareGlobalAllocationFunction(DeclarationName Name, } } - FunctionProtoType::ExtProtoInfo EPI(Context.getDefaultCallingConvention( - /*IsVariadic=*/false, /*IsCXXMethod=*/false, /*IsBuiltin=*/true)); + FunctionProtoType::ExtProtoInfo EPI( + Context.getTargetInfo().getDefaultCallingConv()); QualType BadAllocType; bool HasBadAllocExceptionSpec = Name.isAnyOperatorNew(); diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 4875c25c76988..9276554bebf9d 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1150,7 +1150,7 @@ bool SemaHLSL::handleRootSignatureElements( if (!llvm::hlsl::rootsig::verifyMaxAnisotropy(Sampler->MaxAnisotropy)) ReportError(Loc, 0, 16); if (!llvm::hlsl::rootsig::verifyMipLODBias(Sampler->MipLODBias)) - ReportFloatError(Loc, -16.f, 15.99); + ReportFloatError(Loc, -16.f, 15.99f); } else if (const auto *Clause = std::get_if( &Elem)) { diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 95746b35f71ef..1c6f292454ed6 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -3572,7 +3572,7 @@ ExprResult Sema::ActOnDesignatedInitializer(Designation &Desig, Designators.push_back(ASTDesignator::CreateFieldDesignator( D.getFieldDecl(), D.getDotLoc(), D.getFieldLoc())); } else if (D.isArrayDesignator()) { - Expr *Index = static_cast(D.getArrayIndex()); + Expr *Index = D.getArrayIndex(); llvm::APSInt IndexValue; if (!Index->isTypeDependent() && !Index->isValueDependent()) Index = CheckArrayDesignatorExpr(*this, Index, IndexValue).get(); @@ -3584,8 +3584,8 @@ ExprResult Sema::ActOnDesignatedInitializer(Designation &Desig, InitExpressions.push_back(Index); } } else if (D.isArrayRangeDesignator()) { - Expr *StartIndex = static_cast(D.getArrayRangeStart()); - Expr *EndIndex = static_cast(D.getArrayRangeEnd()); + Expr *StartIndex = D.getArrayRangeStart(); + Expr *EndIndex = D.getArrayRangeEnd(); llvm::APSInt StartValue; llvm::APSInt EndValue; bool StartDependent = StartIndex->isTypeDependent() || diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index aa7191d2814fe..8bde18f64f80b 100644 --- 
a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -22,6 +22,7 @@ #include "clang/AST/ExprCXX.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/ModuleLoader.h" #include "clang/Lex/Preprocessor.h" @@ -777,7 +778,7 @@ static void GetOpenCLBuiltinFctOverloads( std::vector &FunctionList, SmallVector &RetTypes, SmallVector, 5> &ArgTypes) { FunctionProtoType::ExtProtoInfo PI( - Context.getDefaultCallingConvention(false, false, true)); + Context.getTargetInfo().getDefaultCallingConv()); PI.Variadic = false; // Do not attempt to create any FunctionTypes if there are no return types, @@ -4559,15 +4560,14 @@ static void getNestedNameSpecifierIdentifiers( II = NNS->getAsIdentifier(); break; - case NestedNameSpecifier::Namespace: - if (NNS->getAsNamespace()->isAnonymousNamespace()) + case NestedNameSpecifier::Namespace: { + const NamespaceBaseDecl *Namespace = NNS->getAsNamespace(); + if (const auto *NS = dyn_cast(Namespace); + NS && NS->isAnonymousNamespace()) return; - II = NNS->getAsNamespace()->getIdentifier(); - break; - - case NestedNameSpecifier::NamespaceAlias: - II = NNS->getAsNamespaceAlias()->getIdentifier(); + II = Namespace->getIdentifier(); break; + } case NestedNameSpecifier::TypeSpec: II = QualType(NNS->getAsType(), 0).getBaseTypeIdentifier(); diff --git a/clang/lib/Sema/SemaM68k.cpp b/clang/lib/Sema/SemaM68k.cpp index f091827092f83..7a4fcbdb99a99 100644 --- a/clang/lib/Sema/SemaM68k.cpp +++ b/clang/lib/Sema/SemaM68k.cpp @@ -32,7 +32,7 @@ void SemaM68k::handleInterruptAttr(Decl *D, const ParsedAttr &AL) { // FIXME: Check for decl - it should be void ()(void). - Expr *NumParamsExpr = static_cast(AL.getArgAsExpr(0)); + Expr *NumParamsExpr = AL.getArgAsExpr(0); auto MaybeNumParams = NumParamsExpr->getIntegerConstantExpr(getASTContext()); if (!MaybeNumParams) { Diag(AL.getLoc(), diag::err_attribute_argument_type) diff --git a/clang/lib/Sema/SemaMSP430.cpp b/clang/lib/Sema/SemaMSP430.cpp index 5bf931e388f03..3c611f7a27e23 100644 --- a/clang/lib/Sema/SemaMSP430.cpp +++ b/clang/lib/Sema/SemaMSP430.cpp @@ -53,7 +53,7 @@ void SemaMSP430::handleInterruptAttr(Decl *D, const ParsedAttr &AL) { return; } - Expr *NumParamsExpr = static_cast(AL.getArgAsExpr(0)); + Expr *NumParamsExpr = AL.getArgAsExpr(0); std::optional NumParams = llvm::APSInt(32); if (!(NumParams = NumParamsExpr->getIntegerConstantExpr(getASTContext()))) { Diag(AL.getLoc(), diag::err_attribute_argument_type) diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp index 9d5106805892a..9dbb1d28aa722 100644 --- a/clang/lib/Sema/SemaObjCProperty.cpp +++ b/clang/lib/Sema/SemaObjCProperty.cpp @@ -1298,6 +1298,15 @@ Decl *SemaObjC::ActOnPropertyImplDecl( } } + if (Context.getLangOpts().PointerAuthObjcInterfaceSel && + !PropertyIvarType.getPointerAuth()) { + if (Context.isObjCSelType(QualType(PropertyIvarType.getTypePtr(), 0))) { + if (auto PAQ = Context.getObjCMemberSelTypePtrAuth()) + PropertyIvarType = + Context.getPointerAuthType(PropertyIvarType, PAQ); + } + } + Ivar = ObjCIvarDecl::Create(Context, ClassImpDecl, PropertyIvarLoc,PropertyIvarLoc, PropertyIvar, PropertyIvarType, /*TInfo=*/nullptr, diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 46aa7dd0dcc21..128a5db57bf73 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -624,6 +624,66 @@ void SemaOpenACC::CheckDeclReference(SourceLocation Loc, Expr *E, 
Decl *D) { // loop (or we aren't in a loop!) so skip the diagnostic. } +namespace { +// Check whether the type of the thing we are referencing is OK for things like +// private, firstprivate, and reduction, which require certain operators to be +// available. +ExprResult CheckVarType(SemaOpenACC &S, OpenACCClauseKind CK, Expr *VarExpr, + Expr *InnerExpr) { + // There is nothing to do here, only these three have these sorts of + // restrictions. + if (CK != OpenACCClauseKind::Private && + CK != OpenACCClauseKind::FirstPrivate && + CK != OpenACCClauseKind::Reduction) + return VarExpr; + + // We can't test this if it isn't here, or if the type isn't clear yet. + if (!InnerExpr || InnerExpr->isTypeDependent()) + return VarExpr; + + const auto *RD = InnerExpr->getType()->getAsCXXRecordDecl(); + + // if this isn't a C++ record decl, we can create/copy/destroy this thing at + // will without problem, so this is a success. + if (!RD) + return VarExpr; + + // TODO: OpenACC: + // Private must have default ctor + dtor in InnerExpr + // FirstPrivate must have copyctor + dtor in InnerExpr + // Reduction must have copyctor + dtor + operation in InnerExpr + + // TODO OpenACC: It isn't clear what the requirements are for default + // constructor/copy constructor are for First private and reduction, but + // private requires a default constructor. + if (CK == OpenACCClauseKind::Private) { + bool HasNonDeletedDefaultCtor = + llvm::find_if(RD->ctors(), [](const CXXConstructorDecl *CD) { + return CD->isDefaultConstructor() && !CD->isDeleted(); + }) != RD->ctors().end(); + if (!HasNonDeletedDefaultCtor && !RD->needsImplicitDefaultConstructor()) { + S.Diag(InnerExpr->getBeginLoc(), + clang::diag::warn_acc_var_referenced_lacks_op) + << InnerExpr->getType() << CK + << clang::diag::AccVarReferencedReason::DefCtor; + return ExprError(); + } + } + + // All 3 things need to make sure they have a dtor. + bool DestructorDeleted = + RD->getDestructor() && RD->getDestructor()->isDeleted(); + if (DestructorDeleted && !RD->needsImplicitDestructor()) { + S.Diag(InnerExpr->getBeginLoc(), + clang::diag::warn_acc_var_referenced_lacks_op) + << InnerExpr->getType() << CK + << clang::diag::AccVarReferencedReason::Dtor; + return ExprError(); + } + return VarExpr; +} +} // namespace + ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, Expr *VarExpr) { // This has unique enough restrictions that we should split it to a separate @@ -660,7 +720,7 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, if (const auto *DRE = dyn_cast(CurVarExpr)) { if (isa( DRE->getFoundDecl()->getCanonicalDecl())) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } // If CK is a Reduction, this special cases for OpenACC3.3 2.5.15: "A var in a @@ -679,9 +739,9 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, // declare, reduction, and use_device. 
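In user-facing terms, CheckVarType() above targets cases like the following sketch, where 'private' cannot default-construct the per-lane copy it needs (diagnosed via warn_acc_var_referenced_lacks_op):

struct NoDefault {
  NoDefault() = delete; // no usable default constructor
  NoDefault(int X);
};

void use() {
  NoDefault V(42);
  // 'private' must default-construct (and later destroy) a fresh copy of V,
  // so the deleted default constructor is now diagnosed.
#pragma acc parallel private(V)
  {
  }
}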
const auto *This = dyn_cast(ME->getBase()); if (This && This->isImplicit()) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } else { - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } } } @@ -690,14 +750,14 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, // doesn't fall into 'variable or array name' if (CK != OpenACCClauseKind::UseDevice && DK != OpenACCDirectiveKind::Declare && isa(CurVarExpr)) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); // Nothing really we can do here, as these are dependent. So just return they // are valid. if (isa(CurVarExpr) || (CK != OpenACCClauseKind::Reduction && isa(CurVarExpr))) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); // There isn't really anything we can do in the case of a recovery expr, so // skip the diagnostic rather than produce a confusing diagnostic. diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index cc110e1115ed5..43f7992906c54 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -417,7 +417,7 @@ void RISCVIntrinsicManagerImpl::CreateRVVIntrinsicDecl(LookupResult &LR, ArgTypes.push_back(RVVType2Qual(Context, Sigs[i])); FunctionProtoType::ExtProtoInfo PI( - Context.getDefaultCallingConvention(false, false, true)); + Context.getTargetInfo().getDefaultCallingConv()); PI.Variadic = false; diff --git a/clang/lib/Sema/SemaSPIRV.cpp b/clang/lib/Sema/SemaSPIRV.cpp index 76d3cff908b37..c8ea0d09c4081 100644 --- a/clang/lib/Sema/SemaSPIRV.cpp +++ b/clang/lib/Sema/SemaSPIRV.cpp @@ -46,6 +46,49 @@ static bool CheckAllArgsHaveSameType(Sema *S, CallExpr *TheCall) { return false; } +static bool CheckAllArgTypesAreCorrect( + Sema *S, CallExpr *TheCall, + llvm::ArrayRef< + llvm::function_ref> + Checks) { + unsigned NumArgs = TheCall->getNumArgs(); + assert(Checks.size() == NumArgs && + "Wrong number of checks for Number of args."); + // Apply each check to the corresponding argument + for (unsigned I = 0; I < NumArgs; ++I) { + Expr *Arg = TheCall->getArg(I); + if (Checks[I](S, Arg->getBeginLoc(), I + 1, Arg->getType())) + return true; + } + return false; +} + +static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc, + int ArgOrdinal, + clang::QualType PassedType) { + clang::QualType BaseType = + PassedType->isVectorType() + ? 
PassedType->castAs()->getElementType() + : PassedType; + if (!BaseType->isHalfType() && !BaseType->isFloat16Type() && + !BaseType->isFloat32Type()) + return S->Diag(Loc, diag::err_builtin_invalid_arg_type) + << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0 + << /* half or float */ 2 << PassedType; + return false; +} + +static bool CheckFloatOrHalfScalarRepresentation(Sema *S, SourceLocation Loc, + int ArgOrdinal, + clang::QualType PassedType) { + if (!PassedType->isHalfType() && !PassedType->isFloat16Type() && + !PassedType->isFloat32Type()) + return S->Diag(Loc, diag::err_builtin_invalid_arg_type) + << ArgOrdinal << /* scalar */ 1 << /* no int */ 0 + << /* half or float */ 2 << PassedType; + return false; +} + static std::optional processConstant32BitIntArgument(Sema &SemaRef, CallExpr *Call, int Argument) { ExprResult Arg = @@ -235,6 +278,43 @@ bool SemaSPIRV::CheckSPIRVBuiltinFunctionCall(const TargetInfo &TI, TheCall->setType(RetTy); break; } + case SPIRV::BI__builtin_spirv_refract: { + if (SemaRef.checkArgCount(TheCall, 3)) + return true; + + llvm::function_ref + ChecksArr[] = {CheckFloatOrHalfRepresentation, + CheckFloatOrHalfRepresentation, + CheckFloatOrHalfScalarRepresentation}; + if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, + llvm::ArrayRef(ChecksArr))) + return true; + // Check that first two arguments are vectors/scalars of the same type + QualType Arg0Type = TheCall->getArg(0)->getType(); + if (!SemaRef.getASTContext().hasSameUnqualifiedType( + Arg0Type, TheCall->getArg(1)->getType())) + return SemaRef.Diag(TheCall->getBeginLoc(), + diag::err_vec_builtin_incompatible_vector) + << TheCall->getDirectCallee() << /* first two */ 0 + << SourceRange(TheCall->getArg(0)->getBeginLoc(), + TheCall->getArg(1)->getEndLoc()); + + // Check that scalar type of 3rd arg is same as base type of first two args + clang::QualType BaseType = + Arg0Type->isVectorType() + ? Arg0Type->castAs()->getElementType() + : Arg0Type; + if (!SemaRef.getASTContext().hasSameUnqualifiedType( + BaseType, TheCall->getArg(2)->getType())) + return SemaRef.Diag(TheCall->getBeginLoc(), + diag::err_hlsl_builtin_scalar_vector_mismatch) + << /* all */ 0 << TheCall->getDirectCallee() << Arg0Type + << TheCall->getArg(2)->getType(); + + QualType RetTy = TheCall->getArg(0)->getType(); + TheCall->setType(RetTy); + break; + } case SPIRV::BI__builtin_spirv_smoothstep: { if (SemaRef.checkArgCount(TheCall, 3)) return true; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index b76619fc50268..698d1270be634 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -6299,7 +6299,6 @@ bool UnnamedLocalNoLinkageFinder::VisitNestedNameSpecifier( switch (NNS->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: return false; diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index d09a72b71b805..e1a975bcfb3e1 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3083,8 +3083,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // If there was no default argument, deduction is incomplete. 
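Returning to the refract builtin checked above: Sema only enforces the argument types; the semantics follow the usual refract definition, sketched here in scalar form to mirror the scalar overloads added to hlsl_intrinsics.h earlier in this patch:

#include <cmath>

// Scalar sketch of the standard refract definition: I is the incident
// value, N the normal, Eta the ratio of indices of refraction.
float refract_scalar(float I, float N, float Eta) {
  float D = N * I; // dot(N, I) degenerates to a product for scalars
  float K = 1.0f - Eta * Eta * (1.0f - D * D);
  if (K < 0.0f)
    return 0.0f; // total internal reflection
  return Eta * I - (Eta * D + std::sqrt(K)) * N;
}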
if (DefArg.getArgument().isNull()) { - Info.Param = makeTemplateParameter( - const_cast(TemplateParams->getParam(I))); + Info.Param = makeTemplateParameter(TemplateParams->getParam(I)); Info.reset( TemplateArgumentList::CreateCopy(S.Context, CTAI.SugaredConverted), TemplateArgumentList::CreateCopy(S.Context, CTAI.CanonicalConverted)); @@ -3100,8 +3099,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( if (S.CheckTemplateArgument( Param, DefArg, TD, TD->getLocation(), TD->getSourceRange().getEnd(), /*ArgumentPackIndex=*/0, CTAI, Sema::CTAK_Specified)) { - Info.Param = makeTemplateParameter( - const_cast(TemplateParams->getParam(I))); + Info.Param = makeTemplateParameter(TemplateParams->getParam(I)); // FIXME: These template arguments are temporary. Free them! Info.reset( TemplateArgumentList::CreateCopy(S.Context, CTAI.SugaredConverted), @@ -3227,7 +3225,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( if (ParamIdx >= TPL->size()) ParamIdx = TPL->size() - 1; - Decl *Param = const_cast(TPL->getParam(ParamIdx)); + Decl *Param = TPL->getParam(ParamIdx); Info.Param = makeTemplateParameter(Param); Info.FirstArg = Ps[ArgIdx].getArgument(); return TemplateDeductionResult::SubstitutionFailure; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 661746731fdcc..7dbd4bb0ed125 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -4860,7 +4860,9 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, S.Diag(First->getBeginLoc(), diag::err_explicit_object_parameter_invalid) << First->getSourceRange(); - + // Do let non-member function have explicit parameters + // to not break assumptions elsewhere in the code. + First->setExplicitObjectParameterLoc(SourceLocation()); D.setInvalidType(); AreDeclaratorChunksValid = false; } @@ -6531,7 +6533,7 @@ static void HandleAddressSpaceTypeAttribute(QualType &Type, return; } - Expr *ASArgExpr = static_cast(Attr.getArgAsExpr(0)); + Expr *ASArgExpr = Attr.getArgAsExpr(0); LangAS ASIdx; if (!BuildAddressSpaceIndex(S, ASIdx, ASArgExpr, Attr.getLoc())) { Attr.setInvalid(); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 758012f894a41..286c2b486c0f9 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4621,22 +4621,12 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( } case NestedNameSpecifier::Namespace: { - NamespaceDecl *NS = - cast_or_null(getDerived().TransformDecl( - Q.getLocalBeginLoc(), QNNS->getAsNamespace())); + auto *NS = cast(getDerived().TransformDecl( + Q.getLocalBeginLoc(), QNNS->getAsNamespace())); SS.Extend(SemaRef.Context, NS, Q.getLocalBeginLoc(), Q.getLocalEndLoc()); break; } - case NestedNameSpecifier::NamespaceAlias: { - NamespaceAliasDecl *Alias = - cast_or_null(getDerived().TransformDecl( - Q.getLocalBeginLoc(), QNNS->getAsNamespaceAlias())); - SS.Extend(SemaRef.Context, Alias, Q.getLocalBeginLoc(), - Q.getLocalEndLoc()); - break; - } - case NestedNameSpecifier::Global: // There is no meaningful transformation that one could perform on the // global scope. @@ -14034,9 +14024,14 @@ TreeTransform::TransformCXXOperatorCallExpr(CXXOperatorCallExpr *E) { if (Object.isInvalid()) return ExprError(); - // FIXME: Poor location information - SourceLocation FakeLParenLoc = SemaRef.getLocForEndOfToken( - static_cast(Object.get())->getEndLoc()); + // FIXME: Poor location information. 
Also, if the location for the end of + // the token is within a macro expansion, getLocForEndOfToken() will return + // an invalid source location. If that happens and we have an otherwise + // valid end location, use the valid one instead of the invalid one. + SourceLocation EndLoc = static_cast(Object.get())->getEndLoc(); + SourceLocation FakeLParenLoc = SemaRef.getLocForEndOfToken(EndLoc); + if (FakeLParenLoc.isInvalid() && EndLoc.isValid()) + FakeLParenLoc = EndLoc; // Transform the call arguments. SmallVector Args; diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 30e0973149594..3596d2240167e 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -175,6 +175,15 @@ bool ChainedASTReaderListener::ReadLanguageOptions( AllowCompatibleDifferences); } +bool ChainedASTReaderListener::ReadCodeGenOptions( + const CodeGenOptions &CGOpts, StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) { + return First->ReadCodeGenOptions(CGOpts, ModuleFilename, Complain, + AllowCompatibleDifferences) || + Second->ReadCodeGenOptions(CGOpts, ModuleFilename, Complain, + AllowCompatibleDifferences); +} + bool ChainedASTReaderListener::ReadTargetOptions( const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { @@ -383,6 +392,68 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, return false; } +static bool checkCodegenOptions(const CodeGenOptions &CGOpts, + const CodeGenOptions &ExistingCGOpts, + StringRef ModuleFilename, + DiagnosticsEngine *Diags, + bool AllowCompatibleDifferences = true) { + // FIXME: Specify and print a description for each option instead of the name. + // FIXME: Replace with C++20 `using enum CodeGenOptions::CompatibilityKind`. 
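The comparison below is driven entirely by CodeGenOptions.def; as a generic sketch of the x-macro pattern with hypothetical option names (not the real .def contents):

enum class CompatibilityKind { Benign, Compatible, NotCompatible };

struct Opts {
  bool EmitDebugInfo;
  bool VerboseAsm;
};

// Only options whose compatibility kind is not Benign take part in the
// AST-file mismatch check, mirroring checkCodegenOptions() below.
static bool hasIncompatibleMismatch(const Opts &A, const Opts &B) {
  bool Mismatch = false;
#define CHECK_OPT(Field, Kind)                                                 \
  if constexpr (CompatibilityKind::Kind != CompatibilityKind::Benign)          \
    Mismatch |= (A.Field != B.Field);
  CHECK_OPT(EmitDebugInfo, NotCompatible)
  CHECK_OPT(VerboseAsm, Benign) // benign: never blocks reuse of an AST file
#undef CHECK_OPT
  return Mismatch;
}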
+ using CK = CodeGenOptions::CompatibilityKind; +#define CODEGENOPT(Name, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) { \ + if ((CK::Compatibility == CK::NotCompatible) || \ + (CK::Compatibility == CK::Compatible && \ + !AllowCompatibleDifferences)) { \ + if (ExistingCGOpts.Name != CGOpts.Name) { \ + if (Diags) { \ + if (Bits == 1) \ + Diags->Report(diag::err_ast_file_codegenopt_mismatch) \ + << #Name << CGOpts.Name << ExistingCGOpts.Name \ + << ModuleFilename; \ + else \ + Diags->Report(diag::err_ast_file_codegenopt_value_mismatch) \ + << #Name << ModuleFilename; \ + } \ + return true; \ + } \ + } \ + } + +#define VALUE_CODEGENOPT(Name, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) { \ + if ((CK::Compatibility == CK::NotCompatible) || \ + (CK::Compatibility == CK::Compatible && \ + !AllowCompatibleDifferences)) { \ + if (ExistingCGOpts.Name != CGOpts.Name) { \ + if (Diags) \ + Diags->Report(diag::err_ast_file_codegenopt_value_mismatch) \ + << #Name << ModuleFilename; \ + return true; \ + } \ + } \ + } +#define ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) { \ + if ((CK::Compatibility == CK::NotCompatible) || \ + (CK::Compatibility == CK::Compatible && \ + !AllowCompatibleDifferences)) { \ + if (ExistingCGOpts.get##Name() != CGOpts.get##Name()) { \ + if (Diags) \ + Diags->Report(diag::err_ast_file_codegenopt_value_mismatch) \ + << #Name << ModuleFilename; \ + return true; \ + } \ + } \ + } +#define DEBUGOPT(Name, Bits, Default, Compatibility) +#define VALUE_DEBUGOPT(Name, Bits, Default, Compatibility) +#define ENUM_DEBUGOPT(Name, Type, Bits, Default, Compatibility) +#include "clang/Basic/CodeGenOptions.def" + + return false; +} + /// Compare the given set of target options against an existing set of /// target options. /// @@ -462,6 +533,15 @@ bool PCHValidator::ReadLanguageOptions(const LangOptions &LangOpts, AllowCompatibleDifferences); } +bool PCHValidator::ReadCodeGenOptions(const CodeGenOptions &CGOpts, + StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) { + const CodeGenOptions &ExistingCGOpts = Reader.getCodeGenOpts(); + return checkCodegenOptions(ExistingCGOpts, CGOpts, ModuleFilename, + Complain ? 
&Reader.Diags : nullptr, + AllowCompatibleDifferences); +} + bool PCHValidator::ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { @@ -2993,6 +3073,14 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( break; } + case CODEGEN_OPTIONS: { + bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; + if (ParseCodeGenOptions(Record, Filename, Complain, Listener, + AllowCompatibleConfigurationMismatch)) + Result = ConfigurationMismatch; + break; + } + case TARGET_OPTIONS: { bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; if (ParseTargetOptions(Record, Filename, Complain, Listener, @@ -5639,6 +5727,7 @@ namespace { class SimplePCHValidator : public ASTReaderListener { const LangOptions &ExistingLangOpts; + const CodeGenOptions &ExistingCGOpts; const TargetOptions &ExistingTargetOpts; const PreprocessorOptions &ExistingPPOpts; std::string ExistingModuleCachePath; @@ -5647,11 +5736,12 @@ namespace { public: SimplePCHValidator(const LangOptions &ExistingLangOpts, + const CodeGenOptions &ExistingCGOpts, const TargetOptions &ExistingTargetOpts, const PreprocessorOptions &ExistingPPOpts, StringRef ExistingModuleCachePath, FileManager &FileMgr, bool StrictOptionMatches) - : ExistingLangOpts(ExistingLangOpts), + : ExistingLangOpts(ExistingLangOpts), ExistingCGOpts(ExistingCGOpts), ExistingTargetOpts(ExistingTargetOpts), ExistingPPOpts(ExistingPPOpts), ExistingModuleCachePath(ExistingModuleCachePath), FileMgr(FileMgr), @@ -5664,6 +5754,13 @@ namespace { nullptr, AllowCompatibleDifferences); } + bool ReadCodeGenOptions(const CodeGenOptions &CGOpts, + StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) override { + return checkCodegenOptions(ExistingCGOpts, CGOpts, ModuleFilename, + nullptr, AllowCompatibleDifferences); + } + bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { @@ -6012,9 +6109,10 @@ bool ASTReader::readASTFileControlBlock( bool ASTReader::isAcceptableASTFile( StringRef Filename, FileManager &FileMgr, const ModuleCache &ModCache, const PCHContainerReader &PCHContainerRdr, const LangOptions &LangOpts, - const TargetOptions &TargetOpts, const PreprocessorOptions &PPOpts, - StringRef ExistingModuleCachePath, bool RequireStrictOptionMatches) { - SimplePCHValidator validator(LangOpts, TargetOpts, PPOpts, + const CodeGenOptions &CGOpts, const TargetOptions &TargetOpts, + const PreprocessorOptions &PPOpts, StringRef ExistingModuleCachePath, + bool RequireStrictOptionMatches) { + SimplePCHValidator validator(LangOpts, CGOpts, TargetOpts, PPOpts, ExistingModuleCachePath, FileMgr, RequireStrictOptionMatches); return !readASTFileControlBlock(Filename, FileMgr, ModCache, PCHContainerRdr, @@ -6395,6 +6493,28 @@ bool ASTReader::ParseLanguageOptions(const RecordData &Record, AllowCompatibleDifferences); } +bool ASTReader::ParseCodeGenOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, + ASTReaderListener &Listener, + bool AllowCompatibleDifferences) { + unsigned Idx = 0; + CodeGenOptions CGOpts; + using CK = CodeGenOptions::CompatibilityKind; +#define CODEGENOPT(Name, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) \ + CGOpts.Name = static_cast(Record[Idx++]); +#define ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) \ + 
CGOpts.set##Name(static_cast(Record[Idx++])); +#define DEBUGOPT(Name, Bits, Default, Compatibility) +#define VALUE_DEBUGOPT(Name, Bits, Default, Compatibility) +#define ENUM_DEBUGOPT(Name, Type, Bits, Default, Compatibility) +#include "clang/Basic/CodeGenOptions.def" + + return Listener.ReadCodeGenOptions(CGOpts, ModuleFilename, Complain, + AllowCompatibleDifferences); +} + bool ASTReader::ParseTargetOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, @@ -9987,19 +10107,12 @@ ASTRecordReader::readNestedNameSpecifierLoc() { } case NestedNameSpecifier::Namespace: { - NamespaceDecl *NS = readDeclAs(); + auto *NS = readDeclAs(); SourceRange Range = readSourceRange(); Builder.Extend(Context, NS, Range.getBegin(), Range.getEnd()); break; } - case NestedNameSpecifier::NamespaceAlias: { - NamespaceAliasDecl *Alias = readDeclAs(); - SourceRange Range = readSourceRange(); - Builder.Extend(Context, Alias, Range.getBegin(), Range.getEnd()); - break; - } - case NestedNameSpecifier::TypeSpec: { TypeSourceInfo *T = readTypeSourceInfo(); if (!T) @@ -10985,6 +11098,7 @@ void ASTReader::pushExternalDeclIntoScope(NamedDecl *D, DeclarationName Name) { ASTReader::ASTReader(Preprocessor &PP, ModuleCache &ModCache, ASTContext *Context, const PCHContainerReader &PCHContainerRdr, + const CodeGenOptions &CodeGenOpts, ArrayRef> Extensions, StringRef isysroot, DisableValidationForModuleKind DisableValidationKind, @@ -10999,6 +11113,7 @@ ASTReader::ASTReader(Preprocessor &PP, ModuleCache &ModCache, SourceMgr(PP.getSourceManager()), FileMgr(PP.getFileManager()), PCHContainerRdr(PCHContainerRdr), Diags(PP.getDiagnostics()), StackHandler(Diags), PP(PP), ContextObj(Context), + CodeGenOpts(CodeGenOpts), ModuleMgr(PP.getFileManager(), ModCache, PCHContainerRdr, PP.getHeaderSearchInfo()), DummyIdResolver(PP), ReadTimer(std::move(ReadTimer)), isysroot(isysroot), diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index b918bfbd549c3..bd84a9741d01b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -1889,7 +1889,7 @@ void ASTDeclReader::VisitNamespaceAliasDecl(NamespaceAliasDecl *D) { D->NamespaceLoc = readSourceLocation(); D->IdentLoc = readSourceLocation(); D->QualifierLoc = Record.readNestedNameSpecifierLoc(); - D->Namespace = readDeclAs(); + D->Namespace = readDeclAs(); mergeRedeclarable(D, Redecl); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 1a20fc9595dce..e868afeb1a145 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -898,6 +898,7 @@ void ASTWriter::WriteBlockInfoBlock() { BLOCK(OPTIONS_BLOCK); RECORD(LANGUAGE_OPTIONS); + RECORD(CODEGEN_OPTIONS); RECORD(TARGET_OPTIONS); RECORD(FILE_SYSTEM_OPTIONS); RECORD(HEADER_SEARCH_OPTIONS); @@ -1646,6 +1647,23 @@ void ASTWriter::WriteControlBlock(Preprocessor &PP, StringRef isysroot) { Stream.EmitRecord(LANGUAGE_OPTIONS, Record); + // Codegen options. + // FIXME: Replace with C++20 `using enum CodeGenOptions::CompatibilityKind`. 
+ using CK = CodeGenOptions::CompatibilityKind; + Record.clear(); + const CodeGenOptions &CGOpts = getCodeGenOpts(); +#define CODEGENOPT(Name, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) \ + Record.push_back(static_cast(CGOpts.Name)); +#define ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility) \ + if constexpr (CK::Compatibility != CK::Benign) \ + Record.push_back(static_cast(CGOpts.get##Name())); +#define DEBUGOPT(Name, Bits, Default, Compatibility) +#define VALUE_DEBUGOPT(Name, Bits, Default, Compatibility) +#define ENUM_DEBUGOPT(Name, Type, Bits, Default, Compatibility) +#include "clang/Basic/CodeGenOptions.def" + Stream.EmitRecord(CODEGEN_OPTIONS, Record); + // Target options. Record.clear(); const TargetInfo &Target = PP.getTargetInfo(); @@ -5384,11 +5402,12 @@ void ASTWriter::SetSelectorOffset(Selector Sel, uint32_t Offset) { ASTWriter::ASTWriter(llvm::BitstreamWriter &Stream, SmallVectorImpl &Buffer, ModuleCache &ModCache, + const CodeGenOptions &CodeGenOpts, ArrayRef> Extensions, bool IncludeTimestamps, bool BuildingImplicitModule, bool GeneratingReducedBMI) : Stream(Stream), Buffer(Buffer), ModCache(ModCache), - IncludeTimestamps(IncludeTimestamps), + CodeGenOpts(CodeGenOpts), IncludeTimestamps(IncludeTimestamps), BuildingImplicitModule(BuildingImplicitModule), GeneratingReducedBMI(GeneratingReducedBMI) { for (const auto &Ext : Extensions) { @@ -7074,11 +7093,6 @@ void ASTRecordWriter::AddNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) { AddSourceRange(NNS.getLocalSourceRange()); break; - case NestedNameSpecifier::NamespaceAlias: - AddDeclRef(NNS.getNestedNameSpecifier()->getAsNamespaceAlias()); - AddSourceRange(NNS.getLocalSourceRange()); - break; - case NestedNameSpecifier::TypeSpec: AddTypeRef(NNS.getTypeLoc().getType()); AddTypeLoc(NNS.getTypeLoc()); diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp index 77317f0a1db32..f8be0e45078db 100644 --- a/clang/lib/Serialization/GeneratePCH.cpp +++ b/clang/lib/Serialization/GeneratePCH.cpp @@ -25,13 +25,14 @@ using namespace clang; PCHGenerator::PCHGenerator( Preprocessor &PP, ModuleCache &ModCache, StringRef OutputFile, StringRef isysroot, std::shared_ptr Buffer, + const CodeGenOptions &CodeGenOpts, ArrayRef> Extensions, bool AllowASTWithErrors, bool IncludeTimestamps, bool BuildingImplicitModule, bool ShouldCacheASTInMemory, bool GeneratingReducedBMI) : PP(PP), Subject(&PP), OutputFile(OutputFile), isysroot(isysroot.str()), Buffer(std::move(Buffer)), Stream(this->Buffer->Data), - Writer(Stream, this->Buffer->Data, ModCache, Extensions, + Writer(Stream, this->Buffer->Data, ModCache, CodeGenOpts, Extensions, IncludeTimestamps, BuildingImplicitModule, GeneratingReducedBMI), AllowASTWithErrors(AllowASTWithErrors), ShouldCacheASTInMemory(ShouldCacheASTInMemory) { @@ -102,11 +103,12 @@ void PCHGenerator::anchor() {} CXX20ModulesGenerator::CXX20ModulesGenerator(Preprocessor &PP, ModuleCache &ModCache, StringRef OutputFile, + const CodeGenOptions &CodeGenOpts, bool GeneratingReducedBMI, bool AllowASTWithErrors) : PCHGenerator( PP, ModCache, OutputFile, llvm::StringRef(), - std::make_shared(), + std::make_shared(), CodeGenOpts, /*Extensions=*/ArrayRef>(), AllowASTWithErrors, /*IncludeTimestamps=*/false, /*BuildingImplicitModule=*/false, /*ShouldCacheASTInMemory=*/false, diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp index cfcc47c5e71c8..478bd85177143 100644 --- 
a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp @@ -177,7 +177,10 @@ bool tryToFindPtrOrigin( E = unaryOp->getSubExpr(); continue; } - + if (auto *BoxedExpr = dyn_cast(E)) { + E = BoxedExpr->getSubExpr(); + continue; + } break; } // Some other expression. diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index c77ef26da568d..d87484470f8b5 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1941,7 +1941,6 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::ConceptSpecializationExprClass: case Stmt::CXXRewrittenBinaryOperatorClass: case Stmt::RequiresExprClass: - case Expr::CXXParenListInitExprClass: case Stmt::EmbedExprClass: // Fall through. @@ -2315,11 +2314,22 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, break; } - case Stmt::InitListExprClass: + case Stmt::InitListExprClass: { + const InitListExpr *E = cast(S); Bldr.takeNodes(Pred); - VisitInitListExpr(cast(S), Pred, Dst); + ConstructInitList(E, E->inits(), E->isTransparent(), Pred, Dst); Bldr.addNodes(Dst); break; + } + + case Expr::CXXParenListInitExprClass: { + const CXXParenListInitExpr *E = cast(S); + Bldr.takeNodes(Pred); + ConstructInitList(E, E->getInitExprs(), /*IsTransparent*/ false, Pred, + Dst); + Bldr.addNodes(Dst); + break; + } case Stmt::MemberExprClass: Bldr.takeNodes(Pred); @@ -4114,3 +4124,33 @@ void *ProgramStateTrait::GDMIndex() { } void ExprEngine::anchor() { } + +void ExprEngine::ConstructInitList(const Expr *E, ArrayRef Args, + bool IsTransparent, ExplodedNode *Pred, + ExplodedNodeSet &Dst) { + assert((isa(E))); + + const LocationContext *LC = Pred->getLocationContext(); + + StmtNodeBuilder B(Pred, Dst, *currBldrCtx); + ProgramStateRef S = Pred->getState(); + QualType T = E->getType().getCanonicalType(); + + bool IsCompound = T->isArrayType() || T->isRecordType() || + T->isAnyComplexType() || T->isVectorType(); + + if (Args.size() > 1 || (E->isPRValue() && IsCompound && !IsTransparent)) { + llvm::ImmutableList ArgList = getBasicVals().getEmptySValList(); + for (Expr *E : llvm::reverse(Args)) + ArgList = getBasicVals().prependSVal(S->getSVal(E, LC), ArgList); + + B.generateNode(E, Pred, + S->BindExpr(E, LC, svalBuilder.makeCompoundVal(T, ArgList))); + } else { + B.generateNode(E, Pred, + S->BindExpr(E, LC, + Args.size() == 0 + ? 
getSValBuilder().makeZeroVal(T) + : S->getSVal(Args.front(), LC))); + } +} diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp index fa8e669b6bb2f..f1a25a750dd0d 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp @@ -771,54 +771,6 @@ void ExprEngine::VisitLogicalExpr(const BinaryOperator* B, ExplodedNode *Pred, Bldr.generateNode(B, Pred, state->BindExpr(B, Pred->getLocationContext(), X)); } -void ExprEngine::VisitInitListExpr(const InitListExpr *IE, - ExplodedNode *Pred, - ExplodedNodeSet &Dst) { - StmtNodeBuilder B(Pred, Dst, *currBldrCtx); - - ProgramStateRef state = Pred->getState(); - const LocationContext *LCtx = Pred->getLocationContext(); - QualType T = getContext().getCanonicalType(IE->getType()); - unsigned NumInitElements = IE->getNumInits(); - - if (!IE->isGLValue() && !IE->isTransparent() && - (T->isArrayType() || T->isRecordType() || T->isVectorType() || - T->isAnyComplexType())) { - llvm::ImmutableList vals = getBasicVals().getEmptySValList(); - - // Handle base case where the initializer has no elements. - // e.g: static int* myArray[] = {}; - if (NumInitElements == 0) { - SVal V = svalBuilder.makeCompoundVal(T, vals); - B.generateNode(IE, Pred, state->BindExpr(IE, LCtx, V)); - return; - } - - for (const Stmt *S : llvm::reverse(*IE)) { - SVal V = state->getSVal(cast(S), LCtx); - vals = getBasicVals().prependSVal(V, vals); - } - - B.generateNode(IE, Pred, - state->BindExpr(IE, LCtx, - svalBuilder.makeCompoundVal(T, vals))); - return; - } - - // Handle scalars: int{5} and int{} and GLvalues. - // Note, if the InitListExpr is a GLvalue, it means that there is an address - // representing it, so it must have a single init element. 
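The compound/scalar split that ConstructInitList preserves from the removed VisitInitListExpr corresponds to these source-level cases (a sketch):

int Arr[3] = {1, 2, 3}; // compound: bound as a CompoundVal of three SVals
int A = int{5};         // scalar, one element: bound to the element's value
int B = int{};          // scalar, no elements: bound to a zero value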
- assert(NumInitElements <= 1); - - SVal V; - if (NumInitElements == 0) - V = getSValBuilder().makeZeroVal(T); - else - V = state->getSVal(IE->getInit(0), LCtx); - - B.generateNode(IE, Pred, state->BindExpr(IE, LCtx, V)); -} - void ExprEngine::VisitGuardedExpr(const Expr *Ex, const Expr *L, const Expr *R, diff --git a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp index 63ad567ab7151..3e68373028b10 100644 --- a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp +++ b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp @@ -1022,6 +1022,22 @@ getStackOrCaptureRegionForDeclContext(const LocationContext *LC, return (const StackFrameContext *)nullptr; } +static bool isStdStreamVar(const VarDecl *D) { + const IdentifierInfo *II = D->getIdentifier(); + if (!II) + return false; + if (!D->getDeclContext()->isTranslationUnit()) + return false; + StringRef N = II->getName(); + QualType FILETy = D->getASTContext().getFILEType(); + if (FILETy.isNull()) + return false; + FILETy = FILETy.getCanonicalType(); + QualType Ty = D->getType().getCanonicalType(); + return Ty->isPointerType() && Ty->getPointeeType() == FILETy && + (N == "stdin" || N == "stdout" || N == "stderr"); +} + const VarRegion *MemRegionManager::getVarRegion(const VarDecl *D, const LocationContext *LC) { const auto *PVD = dyn_cast<ParmVarDecl>(D); @@ -1054,10 +1070,18 @@ const VarRegion *MemRegionManager::getVarRegion(const VarDecl *D, assert(!Ty.isNull()); if (Ty.isConstQualified()) { sReg = getGlobalsRegion(MemRegion::GlobalImmutableSpaceRegionKind); - } else if (Ctx.getSourceManager().isInSystemHeader(D->getLocation())) { - sReg = getGlobalsRegion(MemRegion::GlobalSystemSpaceRegionKind); } else { - sReg = getGlobalsRegion(MemRegion::GlobalInternalSpaceRegionKind); + // The pointer values of the C standard streams are usually not modified + // by calls to functions declared in system headers, so such calls should + // not invalidate them. Place these globals in the global internal space, + // which system-header calls do not invalidate, instead of the global + // system space, which they do. + if (Ctx.getSourceManager().isInSystemHeader(D->getLocation()) && + !isStdStreamVar(D)) { + sReg = getGlobalsRegion(MemRegion::GlobalSystemSpaceRegionKind); + } else { + sReg = getGlobalsRegion(MemRegion::GlobalInternalSpaceRegionKind); + } } // Finally handle static locals.
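For illustration only (a sketch, not part of the patch): with stdin, stdout, and stderr kept in the internal global space, a call that reaches only system-header code no longer invalidates their values. The snippet below assumes the analyzer's clang_analyzer_eval debug helper from debug.ExprInspection, the same convention the updated stream.c test later in this patch uses; the function name std_stream_sketch is invented for the example.

#include <stdio.h>
#include <stdlib.h>

void clang_analyzer_eval(int); // analyzer debug hook (debug.ExprInspection)

void std_stream_sketch(void) {
  FILE *old_stdin = stdin;
  if ((stdin = fopen("input.txt", "r")) == NULL)
    return;
  clang_analyzer_eval(old_stdin == stdin); // FALSE: fopen returned a fresh FILE*
  free(malloc(8));                         // reaches system-header functions only
  clang_analyzer_eval(old_stdin == stdin); // still FALSE: stdin not invalidated
  fclose(stdin);
}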
diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index caac719caf8e8..eb9fa7a7fa1e8 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -950,7 +950,6 @@ class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> { case NestedNameSpecifier::Global: return syntax::NodeKind::GlobalNameSpecifier; case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Identifier: return syntax::NodeKind::IdentifierNameSpecifier; case NestedNameSpecifier::TypeSpec: { diff --git a/clang/test/AST/ByteCode/complex.cpp b/clang/test/AST/ByteCode/complex.cpp index 2c0111c53d3bf..959d759005ef4 100644 --- a/clang/test/AST/ByteCode/complex.cpp +++ b/clang/test/AST/ByteCode/complex.cpp @@ -396,10 +396,10 @@ namespace ComplexConstexpr { // both-note {{cannot refer to element 3 of array of 2 elements}} constexpr _Complex float *p = 0; constexpr float pr = __real *p; // both-error {{constant expr}} \ - // ref-note {{cannot access real component of null}} \ - // expected-note {{read of dereferenced null pointer}} + // expected-note {{read of dereferenced null pointer}} \ + // ref-note {{dereferencing a null pointer}} constexpr float pi = __imag *p; // both-error {{constant expr}} \ - // ref-note {{cannot access imaginary component of null}} + // ref-note {{dereferencing a null pointer}} constexpr const _Complex double *q = &test3 + 1; constexpr double qr = __real *q; // ref-error {{constant expr}} \ // ref-note {{cannot access real component of pointer past the end}} diff --git a/clang/test/AST/ByteCode/const-eval.c b/clang/test/AST/ByteCode/const-eval.c index eab14c08ec809..c8651a744f969 100644 --- a/clang/test/AST/ByteCode/const-eval.c +++ b/clang/test/AST/ByteCode/const-eval.c @@ -51,6 +51,8 @@ struct s { }; EVAL_EXPR(19, ((int)&*(char*)10 == 10 ?
1 : -1)); +// ref-error@-1 {{expression is not an integer constant expression}} \ +// ref-note@-1 {{dereferencing a null pointer}} #ifndef NEW_INTERP EVAL_EXPR(20, __builtin_constant_p(*((int*) 10))); diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index b34e7823220e2..55554220b0a8a 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -39,7 +39,9 @@ struct S { constexpr S s = { 5 }; constexpr const int *p = &s.m + 1; -constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok +constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; +// ref-error@-1 {{constexpr variable 'np2' must be initialized by a constant expression}} \ +// ref-note@-1 {{dereferencing a null pointer is not allowed in a constant expression}} constexpr int preDec(int x) { // both-error {{never produces a constant expression}} return --x; // both-note {{subexpression}} diff --git a/clang/test/AST/ByteCode/cxx23.cpp b/clang/test/AST/ByteCode/cxx23.cpp index 2856b872d44ab..45dd4f528aefb 100644 --- a/clang/test/AST/ByteCode/cxx23.cpp +++ b/clang/test/AST/ByteCode/cxx23.cpp @@ -369,7 +369,26 @@ namespace NestedUnions { return true; } static_assert(test_nested()); +} + +namespace UnionMemberCallDiags { + struct A { int n; }; + struct B { A a; }; + constexpr A a = (A() = B().a); + union C { + int n; + A a; + }; + constexpr bool g() { + C c = {.n = 1}; + c.a.operator=(B{2}.a); // all-note {{member call on member 'a' of union with active member 'n' is not allowed in a constant expression}} + return c.a.n == 2; + } + static_assert(g()); // all-error {{not an integral constant expression}} \ + // all-note {{in call to}} } + + #endif diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index 670def2d5870e..b587cd6eaf89c 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -486,3 +486,11 @@ namespace bitcast { } static_assert(foo() == 0); } + +constexpr int modify_const_variable() { + const int a = 10; + new ((int *)&a) int(12); // both-note {{modification of object of const-qualified type 'const int' is not allowed in a constant expression}} + return a; +} +static_assert(modify_const_variable()); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index d369c64bc3904..774fed6189d64 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -413,7 +413,7 @@ namespace DeriveFailures { constexpr Derived(int i) : OtherVal(i) {} // ref-error {{never produces a constant expression}} \ // both-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} \ - // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} + // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} }; constexpr Derived D(12); // both-error {{must be initialized by a constant expression}} \ @@ -1660,9 +1660,11 @@ namespace NullptrCast { constexpr A *na = nullptr; constexpr B *nb = nullptr; constexpr A &ra = *nb; // both-error {{constant expression}} \ - // both-note {{cannot access base class of null pointer}} + // ref-note {{dereferencing a null pointer}} \ + // expected-note {{cannot access base class of null pointer}} constexpr B &rb = (B&)*na; // both-error {{constant expression}} \ - // both-note {{cannot access derived class of null pointer}} + // ref-note 
{{dereferencing a null pointer}} \ + // expected-note {{cannot access derived class of null pointer}} constexpr bool test() { auto a = (A*)(B*)nullptr; @@ -1740,7 +1742,7 @@ namespace CtorOfInvalidClass { #if __cplusplus >= 202002L template <typename T, auto Q> concept ReferenceOf = Q; - /// This calls a valid and constexpr copy constructor of InvalidCtor, + /// This calls a valid and constexpr copy constructor of InvalidCtor, /// but should still be rejected. template <ReferenceOf<InvalidCtor> auto R, typename Rep> int F; // both-error {{non-type template argument is not a constant expression}} #endif diff --git a/clang/test/AST/ByteCode/unions.cpp b/clang/test/AST/ByteCode/unions.cpp index f97990c1ff849..0fa44a259a4ff 100644 --- a/clang/test/AST/ByteCode/unions.cpp +++ b/clang/test/AST/ByteCode/unions.cpp @@ -3,6 +3,9 @@ // RUN: %clang_cc1 -verify=ref,both %s // RUN: %clang_cc1 -std=c++20 -verify=ref,both %s +#define assert_active(F) if (!__builtin_is_within_lifetime(&F)) (1/0); +#define assert_inactive(F) if ( __builtin_is_within_lifetime(&F)) (1/0); + union U { int a; int b; @@ -229,9 +232,24 @@ namespace Nested { } static_assert(foo2() == 10); - constexpr int foo3() { // both-error {{constexpr function never produces a constant expression}} + consteval int foo3() { // both-error {{function never produces a constant expression}} U2 u; + /// No active field. + assert_active(u); + assert_inactive(u.u); + assert_inactive(u.u2); + assert_inactive(u.x); + assert_inactive(u.y); + u.u.a = 10; + assert_active(u); + assert_active(u.u); + assert_active(u.u.a); + assert_inactive(u.u.b); + assert_inactive(u.u2); + assert_inactive(u.x); + assert_inactive(u.y); + int a = u.u.b; // both-note 2{{read of member 'b' of union with active member 'a' is not allowed in a constant expression}} return 1; @@ -343,6 +361,21 @@ namespace IndirectField { static_assert(s2.f == 7, ""); } +namespace CtorActivatesFields { + struct TailClobberer { + constexpr TailClobberer() { b = false; } + bool b; + }; + + class expected { + union __union_t { + constexpr __union_t() : __unex_() {} + TailClobberer __unex_; + } __union_; + }; + constexpr expected y; +} + namespace CopyCtor { union U { int a; @@ -579,6 +612,7 @@ namespace MoveOrAssignOp { }; class F { + public: struct __long { min_pointer __data_; }; @@ -599,6 +633,11 @@ namespace MoveOrAssignOp { return true; } static_assert(foo()); + + constexpr F f2{}; + static_assert(__builtin_is_within_lifetime(&f2.__rep_)); + static_assert(__builtin_is_within_lifetime(&f2.__rep_.__l)); + static_assert(__builtin_is_within_lifetime(&f2.__rep_.__l.__data_)); } namespace CopyEmptyUnion { @@ -683,8 +722,6 @@ namespace AnonymousUnion { Long L; }; -#define assert_active(F) if (!__builtin_is_within_lifetime(&F)) (1/0); -#define assert_inactive(F) if ( __builtin_is_within_lifetime(&F)) (1/0); consteval int test() { union UU { struct { @@ -712,6 +749,104 @@ namespace AnonymousUnion { } static_assert(test() == 1); } + +namespace AccessViaPointer { + struct A { + int x; + int y; + int arr[3]; + union { int p, q; }; + }; + union B { + A a; + int b; + }; + + constexpr int write_wrong_member_indirect() { // both-error {{never produces a constant}} + B b = {.b = 1}; + int *p = &b.a.y; + + *p = 12; // both-note 2{{assignment to member 'a' of union with active member 'b'}} + + return *p; + } + static_assert(write_wrong_member_indirect() == 1); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} +} + +namespace Activation { + union U { + int a; + int b; + }; + + struct S { int& b; }; + + constexpr int foo() {
// both-error {{never produces a constant expression}} + U u; + u.a = 10; + S s{u.b}; + + // LHS is a MemberExpr, but not of a union type. shouldn't activate u.b. + s.b = 12; // both-note 2{{assignment to member 'b' of union with active member 'a'}} + + return u.b; + + } + static_assert(foo() == 12); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + + struct SS { + int a; + consteval SS() { + a = 10; + } + }; + + /// Activating the struct should also activate all the struct members. + consteval int structInUnion() { + union { + SS s; + int b; + } u{}; + + // assert_active(u.s); + // assert_active(u.s.a); + //assert_inactive(u.b); + + return u.s.a; + } + static_assert(structInUnion() == 10); + +} + +namespace Activation2 { + struct Base { + int y; + }; + struct A : Base { + int x; + int arr[3]; + union { int p, q; }; + }; + union B { + A a; + int b; + }; + + constexpr int change_member_indirectly() { + B b = {.b = 1}; + b.a.arr[1] = 1; + int &r = b.a.y; + r = 123; + + b.b = 2; + b.a.y = 3; + b.a.arr[2] = 4; + return b.a.arr[2]; + } + static_assert(change_member_indirectly() == 4); +} #endif namespace AddressComparison { diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm index c33d53b047c2e..c69113c48806d 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm @@ -329,13 +329,17 @@ void foo() { } } +#define YES 1 + namespace call_with_cf_constant { void bar(const NSArray *); void baz(const NSDictionary *); + void boo(NSNumber *); void foo() { CFArrayCreateMutable(kCFAllocatorDefault, 10); bar(@[@"hello"]); baz(@{@"hello": @3}); + boo(@YES); } } diff --git a/clang/test/Analysis/div-zero-cxx20.cpp b/clang/test/Analysis/div-zero-cxx20.cpp new file mode 100644 index 0000000000000..00ea96e796777 --- /dev/null +++ b/clang/test/Analysis/div-zero-cxx20.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core.DivideZero -std=c++20 -verify %s + +namespace GH148875 { +struct A { + int x; + A(int v) : x(v) {} +}; + +struct B { + int x; + B() : x(0) {} +}; + +struct C { + int x, y; + C(int a, int b) : x(a), y(b) {} +}; + +struct D { + int x; +}; + +struct E { + D d; + E(int a) : d(a) {} +}; + +struct F { + int x; +}; + +int t1() { + A a{42}; + return 1 / (a.x - 42); // expected-warning {{Division by zero}} +} + +int t2() { + B b{}; + return 1 / b.x; // expected-warning {{Division by zero}} +} + +int t3() { + C c1{1, -1}; + return 1 / (c1.x + c1.y); // expected-warning {{Division by zero}} +} + +int t4() { + C c2{0, 0}; + return 1 / (c2.x + c2.y); // expected-warning {{Division by zero}} +} + +int t5() { + E e{32}; + return 1 / (e.d.x - 32); // expected-warning {{Division by zero}} +} + +int t6() { + F f(32); + return 1 / (f.x - 32); // expected-warning {{Division by zero}} +} +} // namespace GH148875 diff --git a/clang/test/Analysis/div-zero.cpp b/clang/test/Analysis/div-zero.cpp index 063450d8883b0..51ea25e828a18 100644 --- a/clang/test/Analysis/div-zero.cpp +++ b/clang/test/Analysis/div-zero.cpp @@ -11,3 +11,63 @@ int fooPR10616 (int qX ) { return (a % (qX-1)); // expected-warning {{Division by zero}} } + +namespace GH148875 { +struct A { + int x; + A(int v) : x(v) {} +}; + +struct B { + int x; + B() : x(0) {} +}; + +struct C { + int x, y; + C(int a, int b) : x(a), y(b) {} +}; + +struct D { + int x; +}; + +struct E { + D d; + E(int a) : d{a} {} +}; + +struct F { + int x; +}; + +int t1() 
{ + A a{42}; + return 1 / (a.x - 42); // expected-warning {{Division by zero}} +} + +int t2() { + B b{}; + return 1 / b.x; // expected-warning {{Division by zero}} +} + +int t3() { + C c1{1, -1}; + return 1 / (c1.x + c1.y); // expected-warning {{Division by zero}} +} + +int t4() { + C c2{0, 0}; + return 1 / (c2.x + c2.y); // expected-warning {{Division by zero}} +} + +int t5() { + E e{32}; + return 1 / (e.d.x - 32); // expected-warning {{Division by zero}} +} + +int t6() { + F f{32}; + return 1 / (f.x - 32); // expected-warning {{Division by zero}} +} +} diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c index 758b40cca4931..10e6e5381ee6d 100644 --- a/clang/test/Analysis/stream.c +++ b/clang/test/Analysis/stream.c @@ -519,14 +519,36 @@ void reopen_std_stream(void) { if (!fp) return; stdout = fp; // Let's make them alias. - clang_analyzer_eval(fp == oldStdout); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(fp == stdout); // expected-warning {{TRUE}} no-FALSE - clang_analyzer_eval(oldStdout == stdout); // expected-warning {{UNKNOWN}} + clang_analyzer_eval(fp == oldStdout); // expected-warning {{FALSE}} + clang_analyzer_eval(fp == stdout); // expected-warning {{TRUE}} + clang_analyzer_eval(oldStdout == stdout); // expected-warning {{FALSE}} } void only_success_path_does_not_alias_with_stdout(void) { if (stdout) return; FILE *f = fopen("/tmp/foof", "r"); // no-crash + clang_analyzer_eval(f == 0);// expected-warning {{TRUE}} expected-warning {{FALSE}} if (!f) return; fclose(f); } + +extern void do_something(); + +void test_no_invalidate_at_system_call() { + FILE *old_stdin = stdin; + if ((stdin = fopen("x/y/z", "r")) == NULL) + return; + clang_analyzer_eval(old_stdin == stdin); // expected-warning{{FALSE}} + free(malloc(100)); + clang_analyzer_eval(old_stdin == stdin); // expected-warning{{FALSE}} + fclose(stdin); +} + +void test_invalidate_at_non_system_call() { + FILE *old_stdin = stdin; + if ((stdin = fopen("x/y/z", "r")) == NULL) + return; + clang_analyzer_eval(old_stdin == stdin); // expected-warning{{FALSE}} + do_something(); + clang_analyzer_eval(old_stdin == stdin); // expected-warning{{UNKNOWN}} +} diff --git a/clang/test/CIR/CodeGen/builtin_bit.cpp b/clang/test/CIR/CodeGen/builtin_bit.cpp index f017b6eb51971..4ac82bd749e8a 100644 --- a/clang/test/CIR/CodeGen/builtin_bit.cpp +++ b/clang/test/CIR/CodeGen/builtin_bit.cpp @@ -416,3 +416,141 @@ unsigned long long test_builtin_bswap64(unsigned long long x) { // OGCG-LABEL: @_Z20test_builtin_bswap64y // OGCG: %{{.+}} = call i64 @llvm.bswap.i64(i64 %{{.+}}) + +unsigned char test_builtin_rotateleft8(unsigned char x, unsigned char y) { + return __builtin_rotateleft8(x, y); +} + +// CIR-LABEL: @_Z24test_builtin_rotateleft8hh +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u8i + +// LLVM-LABEL: @_Z24test_builtin_rotateleft8hh +// LLVM: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %{{.+}} = call i8 @llvm.fshl.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z24test_builtin_rotateleft8hh +// OGCG: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %{{.+}} = call i8 @llvm.fshl.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +unsigned short test_builtin_rotateleft16(unsigned short x, unsigned short y) { + return __builtin_rotateleft16(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft16tt +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} 
: !u16i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft16tt +// LLVM: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %{{.+}} = call i16 @llvm.fshl.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft16tt +// OGCG: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %{{.+}} = call i16 @llvm.fshl.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +unsigned test_builtin_rotateleft32(unsigned x, unsigned y) { + return __builtin_rotateleft32(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft32jj +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u32i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft32jj +// LLVM: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %{{.+}} = call i32 @llvm.fshl.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft32jj +// OGCG: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %{{.+}} = call i32 @llvm.fshl.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +unsigned long long test_builtin_rotateleft64(unsigned long long x, + unsigned long long y) { + return __builtin_rotateleft64(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft64yy +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u64i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft64yy +// LLVM: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %{{.+}} = call i64 @llvm.fshl.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft64yy +// OGCG: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %{{.+}} = call i64 @llvm.fshl.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +unsigned char test_builtin_rotateright8(unsigned char x, unsigned char y) { + return __builtin_rotateright8(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateright8hh +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u8i + +// LLVM-LABEL: @_Z25test_builtin_rotateright8hh +// LLVM: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %{{.+}} = call i8 @llvm.fshr.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateright8hh +// OGCG: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %{{.+}} = call i8 @llvm.fshr.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +unsigned short test_builtin_rotateright16(unsigned short x, unsigned short y) { + return __builtin_rotateright16(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright16tt +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u16i + +// LLVM-LABEL: @_Z26test_builtin_rotateright16tt +// LLVM: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %{{.+}} = call i16 @llvm.fshr.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z26test_builtin_rotateright16tt +// OGCG: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %{{.+}} = call i16 
@llvm.fshr.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +unsigned test_builtin_rotateright32(unsigned x, unsigned y) { + return __builtin_rotateright32(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright32jj +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u32i + +// LLVM-LABEL: @_Z26test_builtin_rotateright32jj +// LLVM: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %{{.+}} = call i32 @llvm.fshr.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z26test_builtin_rotateright32jj +// OGCG: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %{{.+}} = call i32 @llvm.fshr.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +unsigned long long test_builtin_rotateright64(unsigned long long x, + unsigned long long y) { + return __builtin_rotateright64(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright64yy +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u64i + +// LLVM-LABEL: @_Z26test_builtin_rotateright64yy +// LLVM: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %{{.+}} = call i64 @llvm.fshr.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z26test_builtin_rotateright64yy +// OGCG: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %{{.+}} = call i64 @llvm.fshr.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) diff --git a/clang/test/CIR/CodeGen/complex-builtins.cpp b/clang/test/CIR/CodeGen/complex-builtins.cpp index f0d12d0ef6663..811af47a704f5 100644 --- a/clang/test/CIR/CodeGen/complex-builtins.cpp +++ b/clang/test/CIR/CodeGen/complex-builtins.cpp @@ -83,3 +83,39 @@ void foo3() { // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1 // OGCG: %[[A_IMAG:.*]] = load double, ptr %[[A_IMAG_PTR]], align 8 // OGCG: store double %[[A_IMAG]], ptr %[[INIT]], align 8 + +void foo4() { + float _Complex a; + float _Complex b = __builtin_conjf(a); +} + +// CIR: %[[COMPLEX:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR: %[[RESULT:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float +// CIR: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex<!cir.float> +// CIR: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[RESULT:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[COMPLEX]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[RESULT]], align 4 + +// OGCG: %[[COMPLEX:.*]] = alloca
{ float, float }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/complex-unary.cpp b/clang/test/CIR/CodeGen/complex-unary.cpp new file mode 100644 index 0000000000000..676b5546d28e0 --- /dev/null +++ b/clang/test/CIR/CodeGen/complex-unary.cpp @@ -0,0 +1,286 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-canonicalize -o %t.cir %s 2>&1 | FileCheck --check-prefix=CIR-BEFORE %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-after=cir-lowering-prepare -o %t.cir %s 2>&1 | FileCheck --check-prefixes=CIR-AFTER %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +void foo() { + int _Complex a; + int _Complex b = ~a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i> +// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex<!s32i>, !cir.complex<!s32i> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[B_ADDR]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i> +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!s32i> -> !s32i +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!s32i> -> !s32i +// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !s32i, !s32i +// CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !s32i -> !cir.complex<!s32i> +// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[B_ADDR]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>> + +// LLVM: %[[A_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { i32, i32 }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { i32, i32 } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { i32, i32 } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = sub i32 0, %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { i32, i32 } {{.*}}, i32 %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { i32, i32 } %[[RESULT_TMP]], i32 %[[IMAG_MINUS]], 1 +// LLVM: store { i32, i32 } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 +
+// OGCG: %[[A_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load i32, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load i32, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = sub i32 0, %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store i32 %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store i32 %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo2() { + float _Complex a; + float _Complex b = ~a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float +// CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex<!cir.float> +// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG_MINUS]],
ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo3() { + float _Complex a; + float _Complex b = a++; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.unary(inc, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> +// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR-AFTER: %[[REAL_INC:.*]] = cir.unary(inc, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_INC]], %[[IMAG]] : !cir.float -> !cir.complex<!cir.float> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> +// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_INC:.*]] = fadd float 1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_INC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[TMP]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_INC:.*]] = fadd float %[[A_REAL]], 1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo4() { + float _Complex a; + float _Complex b = ++a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>,
!cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.unary(inc, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_INC:.*]] = cir.unary(inc, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_INC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_INC:.*]] = fadd float 1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_INC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_INC:.*]] = fadd float %[[A_REAL]], 1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo5() { + float _Complex a; + float _Complex b = a--; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.unary(dec, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: 
cir.store{{.*}} %[[COMPLEX_DEC]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> +// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR-AFTER: %[[REAL_DEC:.*]] = cir.unary(dec, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_DEC]], %[[IMAG]] : !cir.float -> !cir.complex<!cir.float> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> +// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_DEC:.*]] = fadd float -1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_DEC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[TMP]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_DEC:.*]] = fadd float %[[A_REAL]], -1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo6() { + float _Complex a; + float _Complex b = --a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.unary(dec, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
+// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float> +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float +// CIR-AFTER: %[[REAL_DEC:.*]] = cir.unary(dec, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_DEC]], %[[IMAG]] : !cir.float -> !cir.complex<!cir.float> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_DEC:.*]] = fadd float -1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_DEC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_DEC:.*]] = fadd float %[[A_REAL]], -1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 diff --git a/clang/test/CXX/drs/cwg14xx.cpp b/clang/test/CXX/drs/cwg14xx.cpp index 17d5c2fc2e210..8d39018d8926c 100644 --- a/clang/test/CXX/drs/cwg14xx.cpp +++ b/clang/test/CXX/drs/cwg14xx.cpp @@ -107,6 +107,8 @@ void f() { constexpr int p = &*a; // since-cxx11-error@-1 {{cannot initialize a variable of type 'const int' with an rvalue of type 'A *'}} constexpr A *p2 = &*a; + // since-cxx11-error@-1 {{constexpr variable 'p2' must be initialized by a constant expression}} + // since-cxx11-note@-2 {{dereferencing a null pointer}} } struct A { diff --git a/clang/test/CXX/drs/cwg8xx.cpp b/clang/test/CXX/drs/cwg8xx.cpp index ecb9113ccfe66..7395f04c8e399 100644 --- a/clang/test/CXX/drs/cwg8xx.cpp +++ b/clang/test/CXX/drs/cwg8xx.cpp @@ -9,10 +9,10 @@ namespace cwg820 { // cwg820: 2.7 export template <class T> struct B {}; // cxx98-17-warning@-1 {{exported templates
are unsupported}} -// since-cxx20-error@-2 {{export declaration can only be used within a module purview}} +// since-cxx20-error@-2 {{export declaration can only be used within a module interface}} export template <class T> void f() {} // cxx98-17-warning@-1 {{exported templates are unsupported}} -// since-cxx20-error@-2 {{export declaration can only be used within a module purview}} +// since-cxx20-error@-2 {{export declaration can only be used within a module interface}} } // namespace cwg820 namespace cwg873 { // cwg873: 3.0 diff --git a/clang/test/CXX/expr/expr.const/p2-0x.cpp b/clang/test/CXX/expr/expr.const/p2-0x.cpp index c6c3381be5523..910c8635f7353 100644 --- a/clang/test/CXX/expr/expr.const/p2-0x.cpp +++ b/clang/test/CXX/expr/expr.const/p2-0x.cpp @@ -199,15 +199,15 @@ namespace UndefinedBehavior { constexpr A *na = nullptr; constexpr B *nb = nullptr; - constexpr A &ra = *nb; // expected-error {{constant expression}} expected-note {{cannot access base class of null pointer}} - constexpr B &rb = (B&)*na; // expected-error {{constant expression}} expected-note {{cannot access derived class of null pointer}} + constexpr A &ra = *nb; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} + constexpr B &rb = (B&)*na; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} static_assert((A*)nb == 0, ""); static_assert((B*)na == 0, ""); constexpr const int &nf = nb->n; // expected-error {{constant expression}} expected-note {{cannot access field of null pointer}} constexpr const int &mf = nb->m; // expected-error {{constant expression}} expected-note {{cannot access field of null pointer}} constexpr const int *np1 = (int*)nullptr + 0; // ok - constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok - constexpr const int *np3 = &(*(int(*)[4])nullptr)[2]; // expected-error {{constant expression}} expected-note {{cannot perform pointer arithmetic on null pointer}} + constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} + constexpr const int *np3 = &(*(int(*)[4])nullptr)[2]; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} struct C { constexpr int f() const { return 0; } @@ -485,7 +485,7 @@ namespace std { namespace TypeId { struct S { virtual void f(); }; constexpr S *p = 0; - constexpr const std::type_info &ti1 = typeid(*p); // expected-error {{must be initialized by a constant expression}} cxx11-note {{typeid applied to expression of polymorphic type 'S'}} cxx20-note {{dereferenced null pointer}} + constexpr const std::type_info &ti1 = typeid(*p); // expected-error {{must be initialized by a constant expression}} cxx11-note {{typeid applied to expression of polymorphic type 'S'}} cxx20-note {{dereferencing a null pointer}} struct T {} t; constexpr const std::type_info &ti2 = typeid(t); diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm index 2158d7fa84b86..ebc76ad16467d 100644 --- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm +++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm @@ -9,7 +9,7 @@ //--- ExportDeclNotInModulePurview.cppm // expected-error@* {{missing 'export module' declaration in module interface unit}} -export int b; // expected-error {{export declaration can only be used within a module purview}} +export int b; // expected-error {{export
declaration can only be used within a module interface}} //--- A.cppm // expected-no-diagnostics @@ -18,7 +18,7 @@ export int a; //--- AddExport.cppm module A; // #module-decl -export int b; // expected-error {{export declaration can only be used within a module purview}} +export int b; // expected-error {{export declaration can only be used within a module interface}} // expected-note@#module-decl {{add 'export' here}} //--- AddExport2.cppm diff --git a/clang/test/CXX/module/module.interface/p1.cpp b/clang/test/CXX/module/module.interface/p1.cpp index 54a201e502323..1754d9ea14618 100644 --- a/clang/test/CXX/module/module.interface/p1.cpp +++ b/clang/test/CXX/module/module.interface/p1.cpp @@ -1,28 +1,19 @@ -// RUN: %clang_cc1 -std=c++2a %s -DERRORS -verify -// RUN: %clang_cc1 -std=c++2a %s -emit-module-interface -o %t.pcm -// RUN: %clang_cc1 -std=c++2a %s -fmodule-file=M=%t.pcm -DIMPLEMENTATION -verify -Db=b2 -Dc=c2 +// RUN: rm -rf %t +// RUN: split-file %s %t -module; - -#ifdef ERRORS -export int a; // expected-error {{export declaration can only be used within a module purview}} -#endif - -#ifndef IMPLEMENTATION -export -#else -// expected-error@#1 {{export declaration can only be used within a module purview}} -// expected-error@#2 {{export declaration can only be used within a module purview}} -// expected-note@+2 1+{{add 'export'}} -#endif -module M; +// RUN: %clang_cc1 -std=c++2a %t/errors.cpp -verify +// RUN: %clang_cc1 -std=c++2a %t/M.cppm -emit-module-interface -o %t/M.pcm +// RUN: %clang_cc1 -std=c++2a %t/impl.cpp -fmodule-file=M=%t/M.pcm -verify +//--- errors.cpp +module; +export int a; // expected-error {{export declaration can only be used within a module interface}} +export module M; export int b; // #1 namespace N { export int c; // #2 } -#ifdef ERRORS namespace { // expected-note 2{{anonymous namespace begins here}} export int d1; // expected-error {{export declaration appears within anonymous namespace}} namespace X { @@ -35,4 +26,19 @@ export { export int f; } // expected-error {{within another export declaration}} module :private; // expected-note {{private module fragment begins here}} export int priv; // expected-error {{export declaration cannot be used in a private module fragment}} -#endif + +//--- M.cppm +export module M; +export int b; +namespace N { + export int c; +} + +//--- impl.cpp +module M; // #M + +export int b2; // expected-error {{export declaration can only be used within a module interface}} +namespace N { + export int c2; // expected-error {{export declaration can only be used within a module interface}} +} +// expected-note@#M 2+{{add 'export'}} diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c index 41f13155847ba..4aafc09602228 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c @@ -92,3 +92,19 @@ void test_pmdmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsi __builtin_mma_pmdmxvi8gerx4spp(&vdmr, vp, vc, 0, 0, 0); *((__dmr1024 *)resp) = vdmr; } + +// CHECK-LABEL: @test_dmf_basic +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsetdmrz() +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> [[TMP0]]) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr %res1, align 128 +// CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr %res2, align 128 +// CHECK-NEXT: [[TMP3:%.*]] = load <1024 x i1>, ptr %p, align 128 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <1024 x 
i1> @llvm.ppc.mma.dmxor(<1024 x i1> [[TMP2]], <1024 x i1> [[TMP3]]) +// CHECK-NEXT: store <1024 x i1> [[TMP4]], ptr %res2, align 128 +void test_dmf_basic(char *p, char *res1, char *res2) { + __dmr1024 x[2]; + __builtin_mma_dmsetdmrz(&x[0]); + __builtin_mma_dmmr((__dmr1024*)res1, &x[0]); + __builtin_mma_dmxor((__dmr1024*)res2, (__dmr1024*)p); +} diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c index 838db02415fe5..b46fa9f2cf157 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c @@ -11,7 +11,7 @@ // RUN: -S -ffp-exception-behavior=strict \ // RUN: -o - %s | FileCheck --check-prefix=CHECK-ASM \ // RUN: --check-prefix=FIXME-CHECK %s -// RUN: %clang_cc1 -triple powerpcspe -ffp-exception-behavior=strict \ +// RUN: %clang_cc1 -triple powerpc -ffp-exception-behavior=strict \ // RUN: -target-feature +vsx -fexperimental-strict-floating-point -emit-llvm \ // RUN: %s -o - | FileCheck --check-prefix=CHECK-CONSTRAINED %s diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-future-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-future-builtin-err.c new file mode 100644 index 0000000000000..9def39f5fa479 --- /dev/null +++ b/clang/test/CodeGen/PowerPC/ppc-dmf-future-builtin-err.c @@ -0,0 +1,15 @@ +// RUN: not %clang_cc1 -triple powerpc64le-unknown-linux-gnu -target-cpu pwr10 \ +// RUN: %s -emit-llvm-only 2>&1 | FileCheck %s + +__attribute__((target("no-mma"))) +void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmsetdmrz(&vdmr); + __builtin_mma_dmmr(&vdmr, (__dmr1024*)vpp); + __builtin_mma_dmxor(&vdmr, (__dmr1024*)vpp); + +// CHECK: error: '__builtin_mma_dmsetdmrz' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_mma_dmmr' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_mma_dmxor' needs target feature mma,isa-future-instructions +} diff --git a/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c similarity index 75% rename from clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c rename to clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c index 1b8d345ac7ec7..c02274696244a 100644 --- a/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c +++ b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c @@ -11,6 +11,9 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) __builtin_mma_pmdmxvi8gerx4pp(&vdmr, vp, vc, 0, 0, 0); __builtin_mma_dmxvi8gerx4spp(&vdmr, vp, vc); __builtin_mma_pmdmxvi8gerx4spp(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_dmsetdmrz(&vdmr); + __builtin_mma_dmmr(&vdmr, (__dmr1024*)vpp); + __builtin_mma_dmxor(&vdmr, (__dmr1024*)vpp); // CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops @@ -18,4 +21,7 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) // CHECK: error: '__builtin_mma_pmdmxvi8gerx4pp' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_dmxvi8gerx4spp' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_pmdmxvi8gerx4spp' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmsetdmrz' needs target feature 
mma,isa-future-instructions +// CHECK: error: '__builtin_mma_dmmr' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_mma_dmxor' needs target feature mma,isa-future-instructions } diff --git a/clang/test/CodeGen/RISCV/riscv-xandesbfhcvt-c-api.c b/clang/test/CodeGen/RISCV/riscv-xandesbfhcvt-c-api.c new file mode 100644 index 0000000000000..9feeafe42c79f --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-xandesbfhcvt-c-api.c @@ -0,0 +1,25 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple riscv32 -target-feature +xandesbfhcvt -emit-llvm %s -o - \ +// RUN: -disable-O0-optnone | opt -S -passes=mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple riscv64 -target-feature +xandesbfhcvt -emit-llvm %s -o - \ +// RUN: -disable-O0-optnone | opt -S -passes=mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_fcvt_s_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = fpext bfloat [[BF:%.*]] to float +// CHECK-NEXT: ret float [[TMP0]] +// +float test_fcvt_s_bf16(__bf16 bf) { + return __riscv_nds_fcvt_s_bf16(bf); +} + +// CHECK-LABEL: @test_fcvt_bf16_s( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = fptrunc float [[SF:%.*]] to bfloat +// CHECK-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_fcvt_bf16_s(float sf) { + return __riscv_nds_fcvt_bf16_s(sf); +} diff --git a/clang/test/CodeGen/RISCV/riscv-xandesbfhcvt.c b/clang/test/CodeGen/RISCV/riscv-xandesbfhcvt.c new file mode 100644 index 0000000000000..2f5e754d38704 --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-xandesbfhcvt.c @@ -0,0 +1,23 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple riscv32 -target-feature +xandesbfhcvt -emit-llvm %s -o - \ +// RUN: -disable-O0-optnone | opt -S -passes=mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple riscv64 -target-feature +xandesbfhcvt -emit-llvm %s -o - \ +// RUN: -disable-O0-optnone | opt -S -passes=mem2reg | FileCheck %s + +// CHECK-LABEL: @test_fcvt_s_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = fpext bfloat [[BF:%.*]] to float +// CHECK-NEXT: ret float [[TMP0]] +// +float test_fcvt_s_bf16(__bf16 bf) { + return __builtin_riscv_nds_fcvt_s_bf16(bf); +} + +// CHECK-LABEL: @test_fcvt_bf16_s( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = fptrunc float [[SF:%.*]] to bfloat +// CHECK-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_fcvt_bf16_s(float sf) { + return __builtin_riscv_nds_fcvt_bf16_s(sf); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/non-overloaded/vcpopv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/non-overloaded/vcpop.c similarity index 100% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/non-overloaded/vcpopv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/non-overloaded/vcpop.c diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/overloaded/vcpopv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/overloaded/vcpop.c similarity index 100% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/overloaded/vcpopv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/non-policy/overloaded/vcpop.c diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/non-overloaded/vcpopv.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/non-overloaded/vcpop.c similarity index 100% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/non-overloaded/vcpopv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/non-overloaded/vcpop.c diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/overloaded/vcpopv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/overloaded/vcpop.c similarity index 100% rename from clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/overloaded/vcpopv.c rename to clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/vector-crypto/policy/overloaded/vcpop.c diff --git a/clang/test/CodeGen/X86/i128-debuginfo.c b/clang/test/CodeGen/X86/i128-debuginfo.c new file mode 100644 index 0000000000000..4b865c1bed9f0 --- /dev/null +++ b/clang/test/CodeGen/X86/i128-debuginfo.c @@ -0,0 +1,10 @@ +// no autogeneration since update_cc_test_checks does not support -g +// RUN: %clang_cc1 -triple x86_64-pc-linux -O1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: define{{.*}} i128 @add(i128 noundef %a) +// CHECK: #dbg_value(i128 %a, ![[DI:.*]], !DIExpression() +__int128_t add(__int128_t a) { + return a + a; +} + +// CHECK: ![[DI]] = !DILocalVariable(name: "a", arg: 1 diff --git a/clang/test/CodeGen/X86/x86_64-arguments-nacl.c b/clang/test/CodeGen/X86/x86_64-arguments-nacl.c deleted file mode 100644 index 4d820aef8fd2d..0000000000000 --- a/clang/test/CodeGen/X86/x86_64-arguments-nacl.c +++ /dev/null @@ -1,92 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-nacl -emit-llvm -o - %s| FileCheck %s -#include -// Test for x86-64 structure representation (instead of pnacl representation), -// in particular for unions. Also crib a few tests from x86 Linux. - -union PP_VarValue { - int as_int; - double as_double; - long long as_i64; -}; - -struct PP_Var { - int type; - int padding; - union PP_VarValue value; -}; - -// CHECK: define{{.*}} { i64, i64 } @f0() -struct PP_Var f0(void) { - struct PP_Var result = { 0, 0, 0 }; - return result; -} - -// CHECK-LABEL: define{{.*}} void @f1(i64 %p1.coerce0, i64 %p1.coerce1) -void f1(struct PP_Var p1) { while(1) {} } - -// long doubles are 64 bits on NaCl -// CHECK-LABEL: define{{.*}} double @f5() -long double f5(void) { - return 0; -} - -// CHECK-LABEL: define{{.*}} void @f6(i8 noundef signext %a0, i16 noundef signext %a1, i32 noundef %a2, i64 noundef %a3, ptr noundef %a4) -void f6(char a0, short a1, int a2, long long a3, void *a4) { -} - -// CHECK-LABEL: define{{.*}} i64 @f8_1() -// CHECK-LABEL: define{{.*}} void @f8_2(i64 %a0.coerce) -union u8 { - long double a; - int b; -}; -union u8 f8_1(void) { while (1) {} } -void f8_2(union u8 a0) {} - -// CHECK-LABEL: define{{.*}} i64 @f9() -struct s9 { int a; int b; int : 0; } f9(void) { while (1) {} } - -// CHECK-LABEL: define{{.*}} void @f10(i64 %a0.coerce) -struct s10 { int a; int b; int : 0; }; -void f10(struct s10 a0) {} - -// CHECK-LABEL: define{{.*}} double @f11() -union { long double a; float b; } f11(void) { while (1) {} } - -// CHECK-LABEL: define{{.*}} i32 @f12_0() -// CHECK-LABEL: define{{.*}} void @f12_1(i32 %a0.coerce) -struct s12 { int a __attribute__((aligned(16))); }; -struct s12 f12_0(void) { while (1) {} } -void f12_1(struct s12 a0) {} - -// Check that sret parameter is accounted for when checking available integer -// registers. 
-// CHECK: define{{.*}} void @f13(ptr dead_on_unwind noalias writable sret(%struct.s13_0) align 8 %agg.result, i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, ptr noundef byval({{.*}}) align 8 %e, i32 noundef %f) - -struct s13_0 { long long f0[3]; }; -struct s13_1 { long long f0[2]; }; -struct s13_0 f13(int a, int b, int c, int d, - struct s13_1 e, int f) { while (1) {} } - -// CHECK-LABEL: define{{.*}} void @f20(ptr noundef byval(%struct.s20) align 32 %x) -struct __attribute__((aligned(32))) s20 { - int x; - int y; -}; -void f20(struct s20 x) {} - - -// CHECK: declare void @func(i64) -typedef struct _str { - union { - long double a; - long c; - }; -} str; - -void func(str s); -str ss; -void f9122143(void) -{ - func(ss); -} diff --git a/clang/test/CodeGen/X86/x86_64-arguments.c b/clang/test/CodeGen/X86/x86_64-arguments.c index 82845f0a2b31f..580f9487395d3 100644 --- a/clang/test/CodeGen/X86/x86_64-arguments.c +++ b/clang/test/CodeGen/X86/x86_64-arguments.c @@ -551,6 +551,45 @@ struct s68 { void f68(struct s68 x) { } +// CHECK-LABEL: define{{.*}} i128 @f69(i128 noundef %a) +__int128_t f69(__int128_t a) { + return a; +} + +// CHECK-LABEL: define{{.*}} i128 @f70(i128 noundef %a) +__uint128_t f70(__uint128_t a) { + return a; +} + +// check that registers are correctly counted for (u)int128_t arguments +struct s71 { + long long a, b; +}; +// CHECK-LABEL: define{{.*}} void @f71(i128 noundef %a, i128 noundef %b, i64 noundef %c, ptr noundef byval(%struct.s71) align 8 %d) +void f71(__int128_t a, __int128_t b, long long c, struct s71 d) { +} +// CHECK-LABEL: define{{.*}} void @f72(i128 noundef %a, i128 noundef %b, i64 %d.coerce0, i64 %d.coerce1) +void f72(__int128_t a, __int128_t b, struct s71 d) { +} + +// check that structs containing (u)int128_t are passed correctly +struct s73 { + struct inner { + __uint128_t a; + }; + struct inner in; +}; +// CHECK-LABEL: define{{.*}} i128 @f73(i128 %a.coerce) +struct s73 f73(struct s73 a) { + return a; +} + +// check that _BitInt(128) is still passed correctly on the stack +// CHECK-LABEL: define{{.*}} i128 @f74(i128 noundef %b, i128 noundef %c, i128 noundef %d, i64 noundef %e, ptr noundef byval(i128) align 8 %0) +_BitInt(128) f74(__uint128_t b, __uint128_t c, __uint128_t d, long e, _BitInt(128) a) { + return a; +} + /// The synthesized __va_list_tag does not have file/line fields. // CHECK: = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "__va_list_tag", // CHECK-NOT: file: diff --git a/clang/test/CodeGen/X86/x86_64-longdouble.c b/clang/test/CodeGen/X86/x86_64-longdouble.c index 7446664bef5bb..5287640e4ed30 100644 --- a/clang/test/CodeGen/X86/x86_64-longdouble.c +++ b/clang/test/CodeGen/X86/x86_64-longdouble.c @@ -4,9 +4,6 @@ // RUN: | FileCheck %s --check-prefix=GNU --check-prefix=CHECK // RUN: %clang_cc1 -triple x86_64 -emit-llvm -O -o - %s \ // RUN: | FileCheck %s --check-prefix=GNU --check-prefix=CHECK -// NaCl is an example of a target for which long double is the same as double. -// RUN: %clang_cc1 -triple x86_64-nacl -emit-llvm -O -o - %s \ -// RUN: | FileCheck %s --check-prefix=NACL --check-prefix=CHECK // Android uses fp128 for long double but other x86_64 targets use x86_fp80. 
@@ -22,14 +19,12 @@ long double TestLD(long double x) { return x * x; // ANDROID: define{{.*}} fp128 @TestLD(fp128 noundef %x) // GNU: define{{.*}} x86_fp80 @TestLD(x86_fp80 noundef %x) -// NACL: define{{.*}} double @TestLD(double noundef %x) } long double _Complex TestLDC(long double _Complex x) { return x * x; // ANDROID: define{{.*}} void @TestLDC(ptr {{.*}}, ptr {{.*}} %x) // GNU: define{{.*}} { x86_fp80, x86_fp80 } @TestLDC(ptr {{.*}} %x) -// NACL: define{{.*}} { double, double } @TestLDC(double noundef %x{{.*}}, double noundef %x{{.*}}) } typedef __builtin_va_list va_list; @@ -60,14 +55,11 @@ long double TestGetVarLD(va_list ap) { // memory. // ANDROID: define{{.*}} fp128 @TestGetVarLD( // GNU: define{{.*}} x86_fp80 @TestGetVarLD( -// NACL: define{{.*}} double @TestGetVarLD( // ANDROID: br label // ANDROID: br label -// NACL: br // ANDROID: = phi // GNU-NOT: br // GNU-NOT: = phi -// NACL: = phi // ANDROID: ret fp128 // GNU: ret x86_fp80 } @@ -78,16 +70,12 @@ long double _Complex TestGetVarLDC(va_list ap) { // ANDROID: define{{.*}} void @TestGetVarLDC(ptr {{.*}}, ptr // GNU: define{{.*}} { x86_fp80, x86_fp80 } @TestGetVarLDC( // Pair of double can go in SSE registers or memory -// NACL: define{{.*}} { double, double } @TestGetVarLDC( // ANDROID-NOT: br // GNU-NOT: br -// NACL: br // ANDROID-NOT: phi // GNU-NOT: phi -// NACL: phi // ANDROID: ret void // GNU: ret { x86_fp80, x86_fp80 } -// NACL: ret { double, double } } void TestVarArg(const char *s, ...); @@ -116,8 +104,6 @@ void TestPassVarLD(long double x) { // ANDROID: call {{.*}} @TestVarArg(ptr {{.*}}, fp128 noundef %x // GNU: define{{.*}} void @TestPassVarLD(x86_fp80 noundef %x) // GNU: call {{.*}} @TestVarArg(ptr {{.*}}, x86_fp80 noundef %x -// NACL: define{{.*}} void @TestPassVarLD(double noundef %x) -// NACL: call {{.*}} @TestVarArg(ptr {{.*}}, double noundef %x } void TestPassVarLDC(long double _Complex x) { @@ -130,6 +116,4 @@ void TestPassVarLDC(long double _Complex x) { // GNU: store x86_fp80 %{{.*}}, ptr % // GNU-NEXT: store x86_fp80 %{{.*}}, ptr % // GNU-NEXT: call {{.*}} @TestVarArg(ptr {{.*}}, ptr {{.*}} % -// NACL: define{{.*}} void @TestPassVarLDC(double noundef %x{{.*}}, double noundef %x{{.*}}) -// NACL: call {{.*}} @TestVarArg(ptr {{.*}}, double noundef %x{{.*}}, double noundef %x{{.*}}) } diff --git a/clang/test/CodeGen/alloc-align-attr.c b/clang/test/CodeGen/alloc-align-attr.c index 76e5d1041b19f..c4c4e76eaaa04 100644 --- a/clang/test/CodeGen/alloc-align-attr.c +++ b/clang/test/CodeGen/alloc-align-attr.c @@ -70,66 +70,42 @@ __INT32_TYPE__ test4(__SIZE_TYPE__ a) { struct Empty {}; struct MultiArgs { __INT64_TYPE__ a, b;}; -// Struct parameter doesn't take up an IR parameter, 'i' takes up 2. +// Struct parameter doesn't take up an IR parameter, 'i' takes up 1. // Truncation to i64 is permissible, since alignments of greater than 2^64 are insane. 
__INT32_TYPE__ *m3(struct Empty s, __int128_t i) __attribute__((alloc_align(2))); // CHECK-LABEL: @test5( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[E:%.*]] = alloca [[STRUCT_EMPTY:%.*]], align 1 -// CHECK-NEXT: [[COERCE:%.*]] = alloca i128, align 16 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0 -// CHECK-NEXT: store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1 -// CHECK-NEXT: store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8 -// CHECK-NEXT: [[A1:%.*]] = load i128, ptr [[A]], align 16 -// CHECK-NEXT: store i128 [[A1]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: store i128 [[TMP2]], ptr [[COERCE]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0 -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call ptr @m3(i64 noundef [[TMP4]], i64 noundef [[TMP6]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64 +// CHECK-NEXT: store i128 [[A:%.*]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @m3(i128 noundef [[TMP0]]) +// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64 // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ] -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CALL]], align 4 -// CHECK-NEXT: ret i32 [[TMP7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CALL]], align 4 +// CHECK-NEXT: ret i32 [[TMP1]] // __INT32_TYPE__ test5(__int128_t a) { struct Empty e; return *m3(e, a); } -// Struct parameter takes up 2 parameters, 'i' takes up 2. +// Struct parameter takes up 2 parameters, 'i' takes up 1. 
__INT32_TYPE__ *m4(struct MultiArgs s, __int128_t i) __attribute__((alloc_align(2))); // CHECK-LABEL: @test6( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[E:%.*]] = alloca [[STRUCT_MULTIARGS:%.*]], align 8 -// CHECK-NEXT: [[COERCE:%.*]] = alloca i128, align 16 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0 -// CHECK-NEXT: store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1 -// CHECK-NEXT: store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8 -// CHECK-NEXT: [[A1:%.*]] = load i128, ptr [[A]], align 16 -// CHECK-NEXT: store i128 [[A1]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0 +// CHECK-NEXT: store i128 [[A:%.*]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1 // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 -// CHECK-NEXT: store i128 [[TMP2]], ptr [[COERCE]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0 -// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1 -// CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call ptr @m4(i64 [[TMP4]], i64 [[TMP6]], i64 noundef [[TMP8]], i64 noundef [[TMP10]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @m4(i64 [[TMP2]], i64 [[TMP4]], i128 noundef [[TMP0]]) +// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64 // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ] -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[CALL]], align 4 -// CHECK-NEXT: ret i32 [[TMP11]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CALL]], align 4 +// CHECK-NEXT: ret i32 [[TMP5]] // __INT32_TYPE__ test6(__int128_t a) { struct MultiArgs e; diff --git a/clang/test/CodeGen/arm-aapcs-vfp.c b/clang/test/CodeGen/arm-aapcs-vfp.c index 9fae33f476d35..6581929f99f14 100644 --- a/clang/test/CodeGen/arm-aapcs-vfp.c +++ b/clang/test/CodeGen/arm-aapcs-vfp.c @@ -5,12 +5,6 @@ // RUN: -ffreestanding \ // RUN: -emit-llvm -w -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple armv7-unknown-nacl-gnueabi \ -// RUN: -target-cpu cortex-a8 \ -// RUN: -mfloat-abi hard \ -// RUN: -ffreestanding \ -// RUN: -emit-llvm -w -o - %s | FileCheck %s - // RUN: %clang_cc1 -triple arm64-apple-darwin9 -target-feature +neon \ // RUN: -ffreestanding \ // RUN: -emit-llvm -w -o - %s | FileCheck -check-prefix=CHECK64 %s diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c index eda6c67fdad00..aa9965b815983 100644 --- a/clang/test/CodeGen/builtins.c +++ b/clang/test/CodeGen/builtins.c @@ -956,36 +956,24 @@ void 
test_builtin_os_log_errno(void) { void test_builtin_os_log_long_double(void *buf, long double ld) { // CHECK: %[[BUF_ADDR:.*]] = alloca ptr, align 8 // CHECK: %[[LD_ADDR:.*]] = alloca x86_fp80, align 16 - // CHECK: %[[COERCE:.*]] = alloca i128, align 16 // CHECK: store ptr %[[BUF]], ptr %[[BUF_ADDR]], align 8 // CHECK: store x86_fp80 %[[LD]], ptr %[[LD_ADDR]], align 16 // CHECK: %[[V0:.*]] = load ptr, ptr %[[BUF_ADDR]], align 8 // CHECK: %[[V1:.*]] = load x86_fp80, ptr %[[LD_ADDR]], align 16 // CHECK: %[[V2:.*]] = bitcast x86_fp80 %[[V1]] to i80 // CHECK: %[[V3:.*]] = zext i80 %[[V2]] to i128 - // CHECK: store i128 %[[V3]], ptr %[[COERCE]], align 16 - // CHECK: %[[V5:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 0 - // CHECK: %[[V6:.*]] = load i64, ptr %[[V5]], align 16 - // CHECK: %[[V7:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 1 - // CHECK: %[[V8:.*]] = load i64, ptr %[[V7]], align 8 - // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i64 noundef %[[V6]], i64 noundef %[[V8]]) + // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i128 noundef %[[V3]]) __builtin_os_log_format(buf, "%Lf", ld); } // CHECK-LABEL: define linkonce_odr hidden void @__os_log_helper_1_0_1_16_0 -// CHECK: (ptr noundef %[[BUFFER:.*]], i64 noundef %[[ARG0_COERCE0:.*]], i64 noundef %[[ARG0_COERCE1:.*]]) +// CHECK: (ptr noundef %[[BUFFER:.*]], i128 noundef %[[ARG0:.*]]) -// CHECK: %[[ARG0:.*]] = alloca i128, align 16 // CHECK: %[[BUFFER_ADDR:.*]] = alloca ptr, align 8 // CHECK: %[[ARG0_ADDR:.*]] = alloca i128, align 16 -// CHECK: %[[V1:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 0 -// CHECK: store i64 %[[ARG0_COERCE0]], ptr %[[V1]], align 16 -// CHECK: %[[V2:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 1 -// CHECK: store i64 %[[ARG0_COERCE1]], ptr %[[V2]], align 8 -// CHECK: %[[ARG01:.*]] = load i128, ptr %[[ARG0]], align 16 // CHECK: store ptr %[[BUFFER]], ptr %[[BUFFER_ADDR]], align 8 -// CHECK: store i128 %[[ARG01]], ptr %[[ARG0_ADDR]], align 16 +// CHECK: store i128 %[[ARG0]], ptr %[[ARG0_ADDR]], align 16 // CHECK: %[[BUF:.*]] = load ptr, ptr %[[BUFFER_ADDR]], align 8 // CHECK: %[[SUMMARY:.*]] = getelementptr i8, ptr %[[BUF]], i64 0 // CHECK: store i8 0, ptr %[[SUMMARY]], align 1 diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c index 55d4c438eedad..fdca4012ee4a4 100644 --- a/clang/test/CodeGen/ext-int-cc.c +++ b/clang/test/CodeGen/ext-int-cc.c @@ -32,11 +32,10 @@ // Make sure 128 and 64 bit versions are passed like integers. 
void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} -// LIN64: define{{.*}} void @ParamPassing(i64 %{{.+}}, i64 %{{.+}}, i64 %{{.+}}) +// LIN64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // WIN64: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) // LIN32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) // WIN32: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) -// NACL: define{{.*}} void @ParamPassing(ptr byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // NVPTX64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // NVPTX: define{{.*}} void @ParamPassing(ptr byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // SPARCV9: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) @@ -67,7 +66,6 @@ void ParamPassing2(_BitInt(127) b, _BitInt(63) c) {} // WIN64: define dso_local void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) // LIN32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) // WIN32: define dso_local void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) -// NACL: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // NVPTX64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // NVPTX: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // SPARCV9: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) @@ -99,7 +97,6 @@ void ParamPassing3(_BitInt(15) a, _BitInt(31) b) {} // WIN64: define dso_local void @ParamPassing3(i15 %{{.+}}, i31 %{{.+}}) // LIN32: define{{.*}} void @ParamPassing3(i15 signext %{{.+}}, i31 signext %{{.+}}) // WIN32: define dso_local void @ParamPassing3(i15 signext %{{.+}}, i31 signext %{{.+}}) -// NACL: define{{.*}} void @ParamPassing3(i15 %{{.+}}, i31 %{{.+}}) // NVPTX64: define{{.*}} void @ParamPassing3(i15 signext %{{.+}}, i31 signext %{{.+}}) // NVPTX: define{{.*}} void @ParamPassing3(i15 signext %{{.+}}, i31 signext %{{.+}}) // SPARCV9: define{{.*}} void @ParamPassing3(i15 signext %{{.+}}, i31 signext %{{.+}}) @@ -136,7 +133,6 @@ void ParamPassing4(_BitInt(129) a) {} // LIN32: define{{.*}} void @ParamPassing4(ptr %{{.+}}) // WIN32: define dso_local void @ParamPassing4(ptr %{{.+}}) // AARCH64: define{{.*}} void @ParamPassing4(ptr %{{.+}}) -// NACL-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // NVPTX64-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // NVPTX-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // SPARCV9-NOT: define{{.*}} void @ParamPassing4(ptr %{{.+}}) @@ -167,7 +163,6 @@ _BitInt(63) ReturnPassing(void) { return 0; } // WIN64: define dso_local i63 @ReturnPassing( // LIN32: define{{.*}} i63 @ReturnPassing( // WIN32: define dso_local i63 @ReturnPassing( -// NACL: define{{.*}} i63 @ReturnPassing( // NVPTX64: define{{.*}} i63 @ReturnPassing( // NVPTX: define{{.*}} i63 @ReturnPassing( // SPARCV9: define{{.*}} signext i63 @ReturnPassing( @@ -198,7 +193,6 @@ _BitInt(64) ReturnPassing2(void) { return 0; } // WIN64: define dso_local i64 @ReturnPassing2( // LIN32: define{{.*}} i64 @ReturnPassing2( // WIN32: define dso_local i64 @ReturnPassing2( -// NACL: define{{.*}} i64 @ReturnPassing2( // NVPTX64: define{{.*}} i64 @ReturnPassing2( // NVPTX: define{{.*}} i64 @ReturnPassing2( // SPARCV9: define{{.*}} i64 @ReturnPassing2( @@ -229,7 +223,6 @@ _BitInt(127) ReturnPassing3(void) { return 0; } // WIN64: define dso_local void @ReturnPassing3(ptr dead_on_unwind noalias writable sret // LIN32: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret // 
WIN32: define dso_local void @ReturnPassing3(ptr dead_on_unwind noalias writable sret -// NACL: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret // NVPTX/64 makes the intentional choice to put all return values direct, even // large structures, so we do the same here. // NVPTX64: define{{.*}} i127 @ReturnPassing3( @@ -258,11 +251,10 @@ _BitInt(127) ReturnPassing3(void) { return 0; } // LA32: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret _BitInt(128) ReturnPassing4(void) { return 0; } -// LIN64: define{{.*}} { i64, i64 } @ReturnPassing4( +// LIN64: define{{.*}} i128 @ReturnPassing4( // WIN64: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // LIN32: define{{.*}} void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // WIN32: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret -// NACL: define{{.*}} void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // NVPTX64: define{{.*}} i128 @ReturnPassing4( // NVPTX: define{{.*}} i128 @ReturnPassing4( // SPARCV9: define{{.*}} i128 @ReturnPassing4( @@ -295,7 +287,6 @@ _BitInt(129) ReturnPassing5(void) { return 0; } // LIN32: define{{.*}} void @ReturnPassing5(ptr dead_on_unwind noalias writable sret // WIN32: define dso_local void @ReturnPassing5(ptr dead_on_unwind noalias writable sret // AARCH64: define{{.*}} void @ReturnPassing5(ptr dead_on_unwind noalias writable sret -// NACL-NOT: define{{.*}} void @ReturnPassing5(ptr dead_on_unwind noalias writable sret // NVPTX64-NOT: define{{.*}} i129 @ReturnPassing5( // NVPTX-NOT: define{{.*}} i129 @ReturnPassing5( // SPARCV9-NOT: define{{.*}} i129 @ReturnPassing5( diff --git a/clang/test/CodeGen/extend-arg-64.c b/clang/test/CodeGen/extend-arg-64.c index 2cb56d35af21d..8b99c01807ecc 100644 --- a/clang/test/CodeGen/extend-arg-64.c +++ b/clang/test/CodeGen/extend-arg-64.c @@ -84,7 +84,7 @@ int test(void) { #ifdef D128 knr(i128); // CHECKEXT: load i128 - // CHECKEXT: call{{.*}} void (i64, i64, ...) @knr + // CHECKEXT: call{{.*}} void (i128, ...) @knr #endif knr(u32, s32, u16, s16, u8, s8); diff --git a/clang/test/CodeGen/long_double_fp128.cpp b/clang/test/CodeGen/long_double_fp128.cpp index 01c81f5d71502..a66e5c5be585f 100644 --- a/clang/test/CodeGen/long_double_fp128.cpp +++ b/clang/test/CodeGen/long_double_fp128.cpp @@ -10,12 +10,9 @@ // RUN: | FileCheck %s --check-prefix=G32 // RUN: %clang_cc1 -triple powerpc-linux-gnu -emit-llvm -o - %s \ // RUN: | FileCheck %s --check-prefix=P32 -// RUN: %clang_cc1 -triple x86_64-nacl -emit-llvm -o - %s \ -// RUN: | FileCheck %s --check-prefix=N64 // Check mangled name of long double. // Android's gcc and llvm use fp128 for long double. -// NaCl uses double format for long double, but still has separate overloads. 
void test(long, float, double, long double, long double _Complex) { } // A64: define{{.*}} void @_Z4testlfdgCg(i64 noundef %0, float noundef %1, double noundef %2, fp128 noundef %3, ptr // G64: define{{.*}} void @_Z4testlfdeCe(i64 noundef %0, float noundef %1, double noundef %2, x86_fp80 noundef %3, ptr @@ -23,4 +20,3 @@ void test(long, float, double, long double, long double _Complex) { } // A32: define{{.*}} void @_Z4testlfdeCe(i32 noundef %0, float noundef %1, double noundef %2, double noundef %3, ptr // G32: define{{.*}} void @_Z4testlfdeCe(i32 noundef %0, float noundef %1, double noundef %2, x86_fp80 noundef %3, ptr // P32: define{{.*}} void @_Z4testlfdgCg(i32 noundef %0, float noundef %1, double noundef %2, ppc_fp128 noundef %3, ptr -// N64: define{{.*}} void @_Z4testlfdeCe(i32 noundef %0, float noundef %1, double noundef %2, double noundef %3, double noundef {{.*}}, double diff --git a/clang/test/CodeGen/malign-double-x86-nacl.c b/clang/test/CodeGen/malign-double-x86-nacl.c deleted file mode 100644 index a415a46221f8b..0000000000000 --- a/clang/test/CodeGen/malign-double-x86-nacl.c +++ /dev/null @@ -1,43 +0,0 @@ -// RUN: %clang_cc1 %s -emit-llvm -o - -triple=i686-unknown-nacl | FileCheck %s -// Check that i686-nacl essentially has -malign-double, which aligns -// double, long double, and long long to 64-bits. - -int checksize[sizeof(long double) == 8 ? 1 : -1]; -int checkalign[__alignof(long double) == 8 ? 1 : -1]; - -// CHECK-LABEL: define{{.*}} void @s1(double noundef %a) -void s1(long double a) {} - -struct st_ld { - char c; - long double ld; -}; -int checksize2[sizeof(struct st_ld) == 16 ? 1 : -1]; -int checkalign2[__alignof(struct st_ld) == 8 ? 1 : -1]; - -int checksize3[sizeof(double) == 8 ? 1 : -1]; -int checkalign3[__alignof(double) == 8 ? 1 : -1]; - -// CHECK-LABEL: define{{.*}} void @s2(double noundef %a) -void s2(double a) {} - -struct st_d { - char c; - double d; -}; -int checksize4[sizeof(struct st_d) == 16 ? 1 : -1]; -int checkalign4[__alignof(struct st_d) == 8 ? 1 : -1]; - - -int checksize5[sizeof(long long) == 8 ? 1 : -1]; -int checkalign5[__alignof(long long) == 8 ? 1 : -1]; - -// CHECK-LABEL: define{{.*}} void @s3(i64 noundef %a) -void s3(long long a) {} - -struct st_ll { - char c; - long long ll; -}; -int checksize6[sizeof(struct st_ll) == 16 ? 1 : -1]; -int checkalign6[__alignof(struct st_ll) == 8 ? 
1 : -1]; diff --git a/clang/test/CodeGen/new-pass-manager-opt-bisect.c b/clang/test/CodeGen/new-pass-manager-opt-bisect.c index 91a0adf252bb5..5d5fdd473422a 100644 --- a/clang/test/CodeGen/new-pass-manager-opt-bisect.c +++ b/clang/test/CodeGen/new-pass-manager-opt-bisect.c @@ -7,6 +7,6 @@ // CHECK: BISECT: running pass (1) // CHECK-NOT: BISECT: running pass (1) // Make sure that legacy pass manager is running -// CHECK: Instruction Selection +// CHECK: -isel int func(int a) { return a; } diff --git a/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp b/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp new file mode 100644 index 0000000000000..0b62f24177bbd --- /dev/null +++ b/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp @@ -0,0 +1,5 @@ +// RUN: %clangxx -g -fsanitize=null -fsanitize-trap=all -fsanitize-annotate-debug-info=all -O2 -std=c++17 -c -o /dev/null %s + +struct foo { + foo(int, long, const int & = int()); +} foo(0, 0); diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 41a3f59b0fc81..92fe3eb6f171c 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -90,22 +90,6 @@ // RUN: FileCheck %s -check-prefix=PS3 // PS3: target datalayout = "E-m:e-p:32:32-Fi64-i64:64-i128:128-n32:64" -// RUN: %clang_cc1 -triple i686-nacl -o - -emit-llvm %s | \ -// RUN: FileCheck %s -check-prefix=I686-NACL -// I686-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n8:16:32-S128" - -// RUN: %clang_cc1 -triple x86_64-nacl -o - -emit-llvm %s | \ -// RUN: FileCheck %s -check-prefix=X86_64-NACL -// X86_64-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n8:16:32:64-S128" - -// RUN: %clang_cc1 -triple arm-nacl -o - -emit-llvm %s | \ -// RUN: FileCheck %s -check-prefix=ARM-NACL -// ARM-NACL: target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S128" - -// RUN: %clang_cc1 -triple mipsel-nacl -o - -emit-llvm %s | \ -// RUN: FileCheck %s -check-prefix=MIPS-NACL -// MIPS-NACL: target datalayout = "e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64" - // RUN: %clang_cc1 -triple wasm32-unknown-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=WEBASSEMBLY32 // WEBASSEMBLY32: target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-n32:64-S128-ni:1:10:20" diff --git a/clang/test/CodeGenCXX/pragma-loop.cpp b/clang/test/CodeGenCXX/pragma-loop.cpp index 4857299f1c037..8cb3346247daf 100644 --- a/clang/test/CodeGenCXX/pragma-loop.cpp +++ b/clang/test/CodeGenCXX/pragma-loop.cpp @@ -203,6 +203,43 @@ void for_test_scalable_1(int *List, int Length) { } } +// Verify for loop is not performing vectorization +void for_test_width_1(int *List, int Length) { +#pragma clang loop vectorize_width(1) interleave_count(4) unroll(disable) distribute(disable) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_20:.*]] + List[i] = i * 2; + } +} + +// Verify for loop is not performing vectorization +void for_test_fixed_1(int *List, int Length) { +#pragma clang loop vectorize_width(1, fixed) interleave_count(4) unroll(disable) distribute(disable) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_21:.*]] + List[i] = i * 2; + } +} + + +// Verify unroll attributes are directly attached to the loop metadata +void for_test_vectorize_disable_unroll(int *List, int Length) { +#pragma clang loop vectorize(disable) unroll_count(8) + for (int i = 0; i < Length; i++) { + // 
CHECK: br label {{.*}}, !llvm.loop ![[LOOP_22:.*]] + List[i] = i * 2; + } +} + +// Verify unroll attributes are directly attached to the loop metadata +void for_test_interleave_vectorize_disable_unroll(int *List, int Length) { +#pragma clang loop vectorize(disable) interleave_count(4) unroll_count(8) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_23:.*]] + List[i] = i * 2; + } +} + // CHECK-DAG: ![[MP:[0-9]+]] = !{!"llvm.loop.mustprogress"} // CHECK-DAG: ![[UNROLL_DISABLE:[0-9]+]] = !{!"llvm.loop.unroll.disable"} @@ -270,3 +307,7 @@ void for_test_scalable_1(int *List, int Length) { // CHECK-DAG: ![[LOOP_17]] = distinct !{![[LOOP_17]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[FIXED_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} // CHECK-DAG: ![[LOOP_18]] = distinct !{![[LOOP_18]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} // CHECK-DAG: ![[LOOP_19]] = distinct !{![[LOOP_19]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} +// CHECK-DAG: ![[LOOP_20]] = distinct !{![[LOOP_20]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[FIXED_VEC]], ![[INTERLEAVE_4]]} +// CHECK-DAG: ![[LOOP_21]] = distinct !{![[LOOP_21]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[FIXED_VEC]], ![[INTERLEAVE_4]]} +// CHECK-DAG: ![[LOOP_22]] = distinct !{![[LOOP_22]], ![[MP]], ![[WIDTH_1]], ![[ISVECTORIZED]], ![[UNROLL_8]]} +// CHECK-DAG: ![[LOOP_23]] = distinct !{![[LOOP_23]], ![[MP]], ![[WIDTH_1]], ![[INTERLEAVE_4]], ![[ISVECTORIZED]], ![[UNROLL_8]]} diff --git a/clang/test/CodeGenCXX/x86_64-arguments-nacl-x32.cpp b/clang/test/CodeGenCXX/x86_64-arguments-nacl-x32.cpp deleted file mode 100644 index f108e528269d6..0000000000000 --- a/clang/test/CodeGenCXX/x86_64-arguments-nacl-x32.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-nacl -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -std=c++11 -triple=x86_64-unknown-linux-gnux32 -emit-llvm -o - %s | FileCheck %s - -struct test_struct {}; -typedef int test_struct::* test_struct_mdp; -typedef int (test_struct::*test_struct_mfp)(); - -// CHECK-LABEL: define{{.*}} i32 @{{.*}}f_mdp{{.*}}(i32 %a) -test_struct_mdp f_mdp(test_struct_mdp a) { return a; } - -// CHECK-LABEL: define {{.*}} @{{.*}}f_mfp{{.*}}(i64 %a.coerce) -test_struct_mfp f_mfp(test_struct_mfp a) { return a; } - -// A struct with <= 12 bytes before a member data pointer should still -// be allowed in registers, since the member data pointer is only 4 bytes. -// CHECK-LABEL: define{{.*}} void @{{.*}}f_struct_with_mdp{{.*}}(i64 %a.coerce0, i64 %a.coerce1) -struct struct_with_mdp { char *a; char *b; char *c; test_struct_mdp d; }; -void f_struct_with_mdp(struct_with_mdp a) { (void)a; } - -struct struct_with_mdp_too_much { - char *a; char *b; char *c; char *d; test_struct_mdp e; -}; -// CHECK-LABEL: define{{.*}} void @{{.*}}f_struct_with_mdp_too_much{{.*}}({{.*}} byval({{.*}} {{.*}} %a) -void f_struct_with_mdp_too_much(struct_with_mdp_too_much a) { - (void)a; -} - -// A struct with <= 8 bytes before a member function pointer should still -// be allowed in registers, since the member function pointer is only 8 bytes. 
-// CHECK-LABEL: define{{.*}} void @{{.*}}f_struct_with_mfp_0{{.*}}(i64 %a.coerce0, i32 %a.coerce1) -struct struct_with_mfp_0 { char *a; test_struct_mfp b; }; -void f_struct_with_mfp_0(struct_with_mfp_0 a) { (void)a; } - -// CHECK-LABEL: define{{.*}} void @{{.*}}f_struct_with_mfp_1{{.*}}(i64 %a.coerce0, i64 %a.coerce1) -struct struct_with_mfp_1 { char *a; char *b; test_struct_mfp c; }; -void f_struct_with_mfp_1(struct_with_mfp_1 a) { (void)a; } - -// CHECK-LABEL: define{{.*}} void @{{.*}}f_struct_with_mfp_too_much{{.*}}({{.*}} byval({{.*}}) {{.*}} %a, i32 noundef %x) -struct struct_with_mfp_too_much { - char *a; char *b; char *c; test_struct_mfp d; -}; -void f_struct_with_mfp_too_much(struct_with_mfp_too_much a, int x) { - (void)a; -} - -/* Struct containing an empty struct */ -typedef struct { int* a; test_struct x; double *b; } struct_with_empty; - -// CHECK-LABEL: define{{.*}} void @{{.*}}f_pass_struct_with_empty{{.*}}(i64 %x{{.*}}, ptr %x -void f_pass_struct_with_empty(struct_with_empty x) { - (void) x; -} - -// CHECK-LABEL: define{{.*}} { i64, ptr } @{{.*}}f_return_struct_with_empty -struct_with_empty f_return_struct_with_empty() { - return {0, {}, 0}; -} diff --git a/clang/test/CodeGenCoroutines/coro-gro.cpp b/clang/test/CodeGenCoroutines/coro-gro.cpp index b62134317cef2..037fd03349e76 100644 --- a/clang/test/CodeGenCoroutines/coro-gro.cpp +++ b/clang/test/CodeGenCoroutines/coro-gro.cpp @@ -106,4 +106,31 @@ invoker g() { // CHECK: call void @_ZN7invoker15invoker_promise17get_return_objectEv({{.*}} %[[AggRes]] co_return; } -// CHECK: ![[OutFrameMetadata]] = !{} \ No newline at end of file + +namespace gh148953 { + +struct Task { + struct promise_type { + Task get_return_object(); + std::suspend_always initial_suspend() { return {}; } + std::suspend_always final_suspend() noexcept { return {}; } + void return_void() {} + void unhandled_exception() {} + }; + Task() {} + // Different from `invoker`, this Task is copy constructible. + Task(const Task&) {}; +}; + +// NRVO on const qualified return type should work. 
+// CHECK: define{{.*}} void @_ZN8gh1489537exampleEv({{.*}} sret(%"struct.gh148953::Task") align 1 %[[NrvoRes:.+]]) +const Task example() { + // CHECK: %[[ResultPtr:.+]] = alloca ptr + // CHECK: store ptr %[[NrvoRes]], ptr %[[ResultPtr]] + // CHECK: coro.init: + // CHECK: call void @_ZN8gh1489534Task12promise_type17get_return_objectEv({{.*}} %[[NrvoRes:.+]], {{.*}}) + co_return; +} + +} // namespace gh148953 +// CHECK: ![[OutFrameMetadata]] = !{} diff --git a/clang/test/CodeGenHLSL/builtins/refract.hlsl b/clang/test/CodeGenHLSL/builtins/refract.hlsl new file mode 100644 index 0000000000000..eda256451ee2b --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/refract.hlsl @@ -0,0 +1,244 @@ +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh( +// CHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half 0xH3C00, [[MUL2_I]] +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[MUL1_I]], [[SUB_I]] +// CHECK: [[SUB4_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half 0xH3C00, [[MUL3_I]] +// CHECK: [[MUL5_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL6_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[TMP0:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.sqrt.f16(half %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn half [[MUL6_I]], %{{.*}} +// CHECK: [[MUL7_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[ADD_I]], %{{.*}} +// CHECK: [[SUB8_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half %{{.*}}, [[MUL7_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt half %{{.*}}, 0xH0000 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CMP_I]], half 0xH0000, half %{{.*}} +// CHECK: ret half [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh( +// SPVCHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.refract.f16.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}) +// SPVCHECK: ret half [[SPV_REFRACT_I]] +// +half test_refract_half(half I, half N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z18test_refract_half2Dv2_DhS_Dh( +// CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = 
call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> splat (half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sqrt.v2f16(<2 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <2 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <2 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <2 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <2 x half> zeroinitializer, <2 x half> %{{.*}} +// CHECK: ret <2 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_refract_half2Dv2_DhS_Dh( +// SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.refract.v2f16.f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <2 x half> [[SPV_REFRACT_I]] +// +half2 test_refract_half2(half2 I, half2 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z18test_refract_half3Dv3_DhS_Dh( +// CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> splat (half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sqrt.v3f16(<3 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <3 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn 
<3 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <3 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <3 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <3 x half> zeroinitializer, <3 x half> %{{.*}} +// CHECK: ret <3 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_refract_half3Dv3_DhS_Dh( +// SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.refract.v3f16.f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <3 x half> [[SPV_REFRACT_I]] +// +half3 test_refract_half3(half3 I, half3 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18test_refract_half4Dv4_DhS_Dh( +// CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> splat (half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sqrt.v4f16(<4 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <4 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <4 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <4 x half> zeroinitializer, <4 x half> %{{.*}} +// CHECK: ret <4 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_refract_half4Dv4_DhS_Dh( +// SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <4 x half> [[SPV_REFRACT_I]] +// +half4 
test_refract_half4(half4 I, half4 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_refract_floatfff( +// CHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float 1.000000e+00, [[MUL2_I]] +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[MUL1_I]], [[SUB_I]] +// CHECK: [[SUB4_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float 1.000000e+00, [[MUL3_I]] +// CHECK: [[MUL5_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL6_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(float %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn float [[MUL6_I]], %{{.*}} +// CHECK: [[MUL7_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[ADD_I]], %{{.*}} +// CHECK: [[SUB8_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float %{{.*}}, [[MUL7_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float %{{.*}}, 0.000000e+00 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CMP_I]], float 0.000000e+00, float %{{.*}} +// CHECK: ret float [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_refract_floatfff( +// SPVCHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.refract.f32.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) +// SPVCHECK: ret float [[SPV_REFRACT_I]] +// +float test_refract_float(float I, float N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z19test_refract_float2Dv2_fS_f( +// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32(<2 x float> 
%{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <2 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <2 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <2 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <2 x float> zeroinitializer, <2 x float> %{{.*}} +// CHECK: ret <2 x float> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_refract_float2Dv2_fS_f( +// SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.refract.v2f32.f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <2 x float> [[SPV_REFRACT_I]] +// +float2 test_refract_float2(float2 I, float2 N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z19test_refract_float3Dv3_fS_f( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <3 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <3 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <3 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <3 x float> zeroinitializer, <3 x float> %{{.*}} +// CHECK: ret <3 x float> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_refract_float3Dv3_fS_f( +// SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// 
SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.refract.v3f32.f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <3 x float> [[SPV_REFRACT_I]] +// +float3 test_refract_float3(float3 I, float3 N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z19test_refract_float4Dv4_fS_f +// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <4 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <4 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <4 x float> zeroinitializer, <4 x float> %{{.*}} +// CHECK: ret <4 x float> [[HLSL_SELECT_I]] + +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_refract_float4Dv4_fS_f( +// SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) %{{.*}}, <4 x float> noundef nofpclass(nan inf) %{{.*}}, float noundef nofpclass(nan inf) %{{.*}}) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <4 x float> [[SPV_REFRACT_I]] +// +float4 test_refract_float4(float4 I, float4 N, float ETA) { + return refract(I, N, ETA); +} diff --git a/clang/test/CodeGenObjC/arc.m b/clang/test/CodeGenObjC/arc.m index 6dbd5057314e4..57afe9c245e77 100644 --- a/clang/test/CodeGenObjC/arc.m +++ b/clang/test/CodeGenObjC/arc.m @@ -1371,13 +1371,13 @@ void test71(void) { // CHECK: %[[T:.*]] = alloca [2 x ptr], align 16 // CHECK: %[[V0:.*]] = call ptr @llvm.objc.retain(ptr %[[A]]) // CHECK: %[[V1:.*]] = call ptr @llvm.objc.retain(ptr %[[B]]) #2 -// CHECK: %[[V3:.*]] = load ptr, ptr %[[A_ADDR]], align 8, !tbaa !7 +// CHECK: %[[V3:.*]] = load ptr, ptr %[[A_ADDR]], align 8, !tbaa !{{[0-9]+}} // CHECK: %[[V4:.*]] = call ptr @llvm.objc.retain(ptr %[[V3]]) #2 -// CHECK: store ptr %[[V4]], ptr %[[T]], align 8, !tbaa !7 +// 
CHECK: store ptr %[[V4]], ptr %[[T]], align 8, !tbaa !{{[0-9]+}} // CHECK: %[[ARRAYINIT_ELEMENT:.*]] = getelementptr inbounds ptr, ptr %[[T]], i64 1 -// CHECK: %[[V5:.*]] = load ptr, ptr %[[B_ADDR]], align 8, !tbaa !7 +// CHECK: %[[V5:.*]] = load ptr, ptr %[[B_ADDR]], align 8, !tbaa !{{[0-9]+}} // CHECK: %[[V6:.*]] = call ptr @llvm.objc.retain(ptr %[[V5]]) #2 -// CHECK: store ptr %[[V6]], ptr %[[ARRAYINIT_ELEMENT]], align 8, !tbaa !7 +// CHECK: store ptr %[[V6]], ptr %[[ARRAYINIT_ELEMENT]], align 8, !tbaa !{{[0-9]+}} // CHECK: %[[ARRAY_BEGIN:.*]] = getelementptr inbounds [2 x ptr], ptr %[[T]], i32 0, i32 0 // CHECK: %[[V7:.*]] = getelementptr inbounds ptr, ptr %[[ARRAY_BEGIN]], i64 2 diff --git a/clang/test/CodeGenObjC/gnustep2-class-exts.m b/clang/test/CodeGenObjC/gnustep2-class-exts.m new file mode 100644 index 0000000000000..b065490a7d8d8 --- /dev/null +++ b/clang/test/CodeGenObjC/gnustep2-class-exts.m @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fobjc-runtime=gnustep-2.2 -emit-llvm -o - %s | FileCheck %s + +@protocol BaseProtocol +@end + +@protocol ExtendedProtocol +@end + +@interface TestClass <BaseProtocol> + +-(void) Meth; +@end + +@interface TestClass () <BaseProtocol, ExtendedProtocol> +@end + +@implementation TestClass +@end + +// Check that we emit metadata for both protocols +// CHECK: @._OBJC_PROTOCOL_ExtendedProtocol = global +// CHECK: @._OBJC_PROTOCOL_BaseProtocol = global + +// Check that we deduplicate the protocol list +// CHECK: @.objc_protocol_list{{\.[0-9]*}} = internal global { ptr, i64, [2 x ptr] } diff --git a/clang/test/CodeGenObjC/matrix-type-operators.m b/clang/test/CodeGenObjC/matrix-type-operators.m index 179bb180daf90..ddd6c5f003f0a 100644 --- a/clang/test/CodeGenObjC/matrix-type-operators.m +++ b/clang/test/CodeGenObjC/matrix-type-operators.m @@ -13,11 +13,11 @@ @interface IntValue // CHECK-LABEL: @test_index_placeholders( // CHECK-NEXT: entry: // CHECK: [[IV:%.*]] = load ptr, ptr [[IV_ADDR:%.*]], align 8 -// CHECK-NEXT: [[SEL:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7 +// CHECK-NEXT: [[SEL:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !{{[0-9]+}} // CHECK-NEXT: [[CALL:%.*]] = call i32 @objc_msgSend(ptr noundef [[IV]], ptr noundef [[SEL]]) // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[CALL]] to i64 // CHECK-NEXT: [[IV2:%.*]] = load ptr, ptr [[IV_ADDR]], align 8 -// CHECK-NEXT: [[SEL2:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7 +// CHECK-NEXT: [[SEL2:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !{{[0-9]+}} // CHECK-NEXT: [[CALL1:%.*]] = call i32 @objc_msgSend(ptr noundef [[IV2]], ptr noundef [[SEL2]]) // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[CALL1]] to i64 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4 @@ -38,17 +38,17 @@ @interface MatrixValue // CHECK-LABEL: @test_base_and_index_placeholders( // CHECK: [[IV:%.*]] = load ptr, ptr [[IV_ADDR:%.*]], align 8 -// CHECK-NEXT: [[SEL:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7 +// CHECK-NEXT: [[SEL:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !{{[0-9]+}} // CHECK-NEXT: [[CALL:%.*]] = call i32 @objc_msgSend(ptr noundef [[IV]], ptr noundef [[SEL]]) // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[CALL]] to i64 // CHECK-NEXT: [[IV2:%.*]] = load ptr, ptr [[IV_ADDR]], align 8 -// CHECK-NEXT: [[SEL2:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7 +// CHECK-NEXT: [[SEL2:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, 
!invariant.load !{{[0-9]+}} // CHECK-NEXT: [[CALL1:%.*]] = call i32 @objc_msgSend(ptr noundef [[IV2]], ptr noundef [[SEL2]]) // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[CALL1]] to i64 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]] // CHECK-NEXT: [[M:%.*]] = load ptr, ptr %m.addr, align 8 -// CHECK-NEXT: [[SEL3:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7 +// CHECK-NEXT: [[SEL3:%.*]] = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !{{[0-9]+}} // CHECK-NEXT: [[MAT:%.*]] = call <16 x double> @objc_msgSend(ptr noundef [[M]], ptr noundef [[SEL3]]) // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret double [[MATEXT]] diff --git a/clang/test/CodeGenObjC/ptrauth-attr-exception.m b/clang/test/CodeGenObjC/ptrauth-attr-exception.m new file mode 100644 index 0000000000000..8c07a698ecc70 --- /dev/null +++ b/clang/test/CodeGenObjC/ptrauth-attr-exception.m @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -triple arm64e -fptrauth-calls -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination -emit-llvm -fexceptions -fobjc-exceptions -o - %s | FileCheck %s + +__attribute__((objc_root_class)) +@interface Root { + Class isa; +} +@end + +__attribute__((objc_exception)) +@interface A : Root +@end + +@implementation A +@end + +// CHECK: @"OBJC_EHTYPE_$_A" = global %struct._objc_typeinfo { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @objc_ehtype_vtable, i32 2), i32 2), ptr @OBJC_CLASS_NAME_, ptr @"OBJC_CLASS_$_A" } diff --git a/clang/test/CodeGenObjC/ptrauth-block-isa.m b/clang/test/CodeGenObjC/ptrauth-block-isa.m new file mode 100644 index 0000000000000..c1e98c6fd9d3e --- /dev/null +++ b/clang/test/CodeGenObjC/ptrauth-block-isa.m @@ -0,0 +1,40 @@ +// RUN: %clang_cc1 -fptrauth-calls -fptrauth-objc-isa -fobjc-arc -fblocks -triple arm64e -emit-llvm %s -o - | FileCheck %s + +void (^globalblock)(void) = ^{}; +// CHECK: [[GLOBAL_BLOCK:@.*]] = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr [[GLOBAL_BLOCK]]), i32 1342177280, i32 0, ptr @globalblock_block_invoke, ptr @"__block_descriptor_32_e5_v8\01?0l" }, align 8 #0 + +@interface A +- (int) count; +@end + +void use_block(int (^)(void)); + +// CHECK-LABEL: define dso_local void @test_block_literal( +void test_block_literal(int i) { + // CHECK: [[I:%.*]] = alloca i32, + // CHECK-NEXT: [[BLOCK:%.*]] = alloca [[BLOCK_T:.*]], align + // CHECK: [[ISAPTRADDR:%.*]] = getelementptr inbounds nuw [[BLOCK_T]], ptr [[BLOCK]], i32 0, i32 0 + // CHECK-NEXT: [[ISAPTRADDR_I:%.*]] = ptrtoint ptr [[ISAPTRADDR]] to i64 + // CHECK-NEXT: [[ISADISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[ISAPTRADDR_I]], i64 27361) + // CHECK-NEXT: [[SIGNEDISA:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @_NSConcreteStackBlock to i64), i32 2, i64 [[ISADISCRIMINATOR]]) + // CHECK-NEXT: [[SIGNEDISAPTR:%.*]] = inttoptr i64 [[SIGNEDISA]] to ptr + // CHECK-NEXT: store ptr [[SIGNEDISAPTR]], ptr [[ISAPTRADDR]] + use_block(^{return i;}); +} + +void test_conversion_helper(id); + +// CHECK-LABEL: define dso_local void @test_conversion( +void test_conversion(id a) { + // CHECK: [[A:%.*addr]] = alloca ptr + // CHECK-NEXT: [[BLOCK:%.*]] = alloca [[BLOCK_T:.*]], align + // CHECK: 
[[ISAPTRADDR:%.*]] = getelementptr inbounds nuw [[BLOCK_T]], ptr [[BLOCK]], i32 0, i32 0 + // CHECK-NEXT: [[ISAPTRADDR_I:%.*]] = ptrtoint ptr [[ISAPTRADDR]] to i64 + // CHECK-NEXT: [[ISADISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[ISAPTRADDR_I]], i64 27361) + // CHECK-NEXT: [[SIGNEDISA:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @_NSConcreteStackBlock to i64), i32 2, i64 [[ISADISCRIMINATOR]]) + // CHECK-NEXT: [[SIGNEDISAPTR:%.*]] = inttoptr i64 [[SIGNEDISA]] to ptr + // CHECK-NEXT: store ptr [[SIGNEDISAPTR]], ptr [[ISAPTRADDR]] + test_conversion_helper(^{ + (void)a; + }); +} diff --git a/clang/test/CodeGenObjC/ptrauth-class-ro.m b/clang/test/CodeGenObjC/ptrauth-class-ro.m new file mode 100644 index 0000000000000..a80b3d34bf9f4 --- /dev/null +++ b/clang/test/CodeGenObjC/ptrauth-class-ro.m @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -Wno-objc-root-class -fptrauth-objc-class-ro -fobjc-arc -emit-llvm -o - %s | FileCheck %s + +// CHECK: @"OBJC_CLASS_$_C" = global %struct._class_t { ptr @"OBJC_METACLASS_$_C", ptr null, ptr @_objc_empty_cache, ptr null, ptr ptrauth (ptr @"_OBJC_CLASS_RO_$_C", i32 2, i64 25080, ptr getelementptr inbounds (%struct._class_t, ptr @"OBJC_CLASS_$_C", i32 0, i32 4)) }, section "__DATA, __objc_data", align 8 +// CHECK: @"OBJC_METACLASS_$_C" = global %struct._class_t { ptr @"OBJC_METACLASS_$_C", ptr @"OBJC_CLASS_$_C", ptr @_objc_empty_cache, ptr null, ptr ptrauth (ptr @"_OBJC_METACLASS_RO_$_C", i32 2, i64 25080, ptr getelementptr inbounds (%struct._class_t, ptr @"OBJC_METACLASS_$_C", i32 0, i32 4)) }, section "__DATA, __objc_data", align 8 +// CHECK: @OBJC_CLASS_NAME_ = private unnamed_addr constant [2 x i8] c"C\00", section "__TEXT,__objc_classname,cstring_literals", align 1 +// CHECK: @"_OBJC_METACLASS_RO_$_C" = internal global %struct._class_ro_t { i32 131, i32 40, i32 40, ptr null, ptr @OBJC_CLASS_NAME_, ptr null, ptr null, ptr null, ptr null, ptr null }, section "__DATA, __objc_const", align 8 +// CHECK: @OBJC_METH_VAR_NAME_ = private unnamed_addr constant [3 x i8] c"m0\00", section "__TEXT,__objc_methname,cstring_literals", align 1 +// CHECK: @OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [8 x i8] c"v16@0:8\00", section "__TEXT,__objc_methtype,cstring_literals", align 1 +// CHECK: @"_OBJC_$_INSTANCE_METHODS_C" = internal global { i32, i32, [1 x %struct._objc_method] } { i32 24, i32 1, [1 x %struct._objc_method] [%struct._objc_method { ptr @OBJC_METH_VAR_NAME_, ptr @OBJC_METH_VAR_TYPE_, ptr ptrauth (ptr @"\01-[C m0]", i32 0, i64 0, ptr getelementptr inbounds ({ i32, i32, [1 x %struct._objc_method] }, ptr @"_OBJC_$_INSTANCE_METHODS_C", i32 0, i32 2, i32 0, i32 2)) }] }, section "__DATA, __objc_const", align 8 +// CHECK: @"_OBJC_CLASS_RO_$_C" = internal global %struct._class_ro_t { i32 130, i32 0, i32 0, ptr null, ptr @OBJC_CLASS_NAME_, ptr ptrauth (ptr @"_OBJC_$_INSTANCE_METHODS_C", i32 2, i64 49936, ptr getelementptr inbounds (%struct._class_ro_t, ptr @"_OBJC_CLASS_RO_$_C", i32 0, i32 5)), ptr null, ptr null, ptr null, ptr null }, section "__DATA, __objc_const", align 8 +// CHECK: @OBJC_SELECTOR_REFERENCES_ = internal externally_initialized global ptr @OBJC_METH_VAR_NAME_, section "__DATA,__objc_selrefs,literal_pointers,no_dead_strip", align 8 +// CHECK: @"OBJC_LABEL_CLASS_$" = private global [1 x ptr] [ptr @"OBJC_CLASS_$_C"], section "__DATA,__objc_classlist,regular,no_dead_strip" + +@interface C +- (void) m0; +@end + +@implementation C +- (void)m0 {} +@end + +void test_sign_class_ro(C *c) { + [c m0]; +}
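In ptrauth-class-ro.m above, the pointer from the class object to its class_ro_t is signed with key 2 (the ASDA data key), address diversification, and the constant discriminator 25080. As a minimal sketch of the consumer side, using only the documented <ptrauth.h> intrinsics (read_class_ro and ro_slot are illustrative names, not part of the patch):

    #include <ptrauth.h>

    /* Authenticate a signed class_ro_t pointer the way the CHECK lines
     * above describe it: key 2 (ptrauth_key_asda), with the slot address
     * blended into the constant discriminator 25080. */
    static const void *read_class_ro(void *const *ro_slot) {
      return ptrauth_auth_data(*ro_slot, ptrauth_key_asda,
                               ptrauth_blend_discriminator(ro_slot, 25080));
    }

The blend step is what makes the signature slot-specific: moving the signed pointer to a different address invalidates it, which is exactly what the address-diversified schema is meant to enforce.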
diff --git a/clang/test/CodeGenObjC/ptrauth-class.m b/clang/test/CodeGenObjC/ptrauth-class.m new file mode 100644 index 0000000000000..c5ad03665f2a7 --- /dev/null +++ b/clang/test/CodeGenObjC/ptrauth-class.m @@ -0,0 +1,103 @@ +// RUN: %clang_cc1 -Wno-everything -fblocks -fptrauth-intrinsics -triple arm64-apple-ios -fobjc-runtime=ios-12.2 -emit-llvm -no-enable-noundef-analysis -fobjc-arc -O2 -disable-llvm-passes -o - %s | FileCheck %s + +#if __has_feature(ptrauth_objc_signable_class) +struct TestStruct { + __ptrauth(2, 1, 1234) Class isa; +}; + +@interface TestClass { +@public + __ptrauth(2, 1, 1234) Class isa; +} +@end + +struct TestConstStruct { + __ptrauth(2, 1, 1234) const Class isa; + __ptrauth(2, 1, 1234) volatile Class visa; +}; + +@interface TestConstClass { +@public + __ptrauth(2, 1, 1234) const Class isa; + __ptrauth(2, 1, 1234) volatile Class visa; +} +@end + +// CHECK-LABEL: define void @setTestStructIsa(ptr %t, ptr %c) #0 { +void setTestStructIsa(struct TestStruct *t, Class c) { + t->isa = c; + // CHECK: [[T_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: [[C_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: store ptr %c, ptr [[C_ADDR]], align 8 + // CHECK: [[ISA_SLOT:%.*]] = getelementptr inbounds nuw %struct.TestStruct, ptr %0, i32 0, i32 0 + // CHECK: [[C:%.*]] = load ptr, ptr %c.addr, align 8 + // CHECK: [[CAST_ISA_SLOT:%.*]] = ptrtoint ptr [[ISA_SLOT]] to i64 + // CHECK: [[BLENDED_VALUE:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_ISA_SLOT]], i64 1234) + // CHECK: [[CAST_C:%.*]] = ptrtoint ptr [[C]] to i64 + // CHECK: [[SIGNED:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[CAST_C]], i32 2, i64 [[BLENDED_VALUE]]) +} + +// CHECK-LABEL: define void @setTestClassIsa(ptr %t, ptr %c) #0 { +void setTestClassIsa(TestClass *t, Class c) { + t->isa = c; + // CHECK: [[T_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: [[C_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: store ptr %c, ptr [[C_ADDR]], align 8 + // CHECK: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8 + // CHECK: [[IVAR_OFFSET32:%.*]] = load i32, ptr @"OBJC_IVAR_$_TestClass.isa", align 8 + // CHECK: [[IVAR_OFFSET64:%.*]] = sext i32 [[IVAR_OFFSET32]] to i64 + // CHECK: [[ADDED_PTR:%.*]] = getelementptr inbounds i8, ptr %1, i64 [[IVAR_OFFSET64]] + // CHECK: [[C_VALUE:%.*]] = load ptr, ptr [[C_ADDR]], align 8 + // CHECK: [[CAST_ISA_SLOT:%.*]] = ptrtoint ptr [[ADDED_PTR]] to i64 + // CHECK: [[BLENDED_VALUE:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_ISA_SLOT]], i64 1234) + // CHECK: [[CAST_C_VALUE:%.*]] = ptrtoint ptr [[C_VALUE]] to i64 + // CHECK: [[SIGNED:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[CAST_C_VALUE]], i32 2, i64 [[BLENDED_VALUE]]) +} + +// CHECK-LABEL: define ptr @getTestStructIsa(ptr %t) #0 { +Class getTestStructIsa(struct TestStruct *t) { + return t->isa; + // CHECK: [[T_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: [[T_VALUE:%.*]] = load ptr, ptr [[T_ADDR]], align 8 + // CHECK: [[ISA_SLOT:%.*]] = getelementptr inbounds nuw %struct.TestStruct, ptr [[T_VALUE]], i32 0, i32 0 + // CHECK: [[ISA_VALUE:%.*]] = load ptr, ptr [[ISA_SLOT]], align 8 + // CHECK: [[CAST_ISA_SLOT:%.*]] = ptrtoint ptr %isa to i64 + // CHECK: [[BLENDED_VALUE:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_ISA_SLOT]], i64 1234) + // CHECK: [[CAST_ISA_VALUE:%.*]] = ptrtoint ptr [[ISA_VALUE]] to i64 + // CHECK: [[AUTHED_VALUE:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_ISA_VALUE]], i32 2, i64 [[BLENDED_VALUE]]) +} + +// CHECK-LABEL: define ptr @getTestClassIsa(ptr %t) #0 { +Class getTestClassIsa(TestClass *t) { + return t->isa; + // CHECK: 
[[T_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8 + // CHECK: [[IVAR:%.*]] = load i32, ptr @"OBJC_IVAR_$_TestClass.isa", align 8 + // CHECK: [[IVAR_CONV:%.*]] = sext i32 [[IVAR]] to i64 + // CHECK: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[T]], i64 [[IVAR_CONV]] + // CHECK: [[LOADED_VALUE:%.*]] = load ptr, ptr [[ADD_PTR]], align 8 + // CHECK: [[INT_VALUE:%.*]] = ptrtoint ptr [[ADD_PTR]] to i64 + // CHECK: [[BLENDED_VALUE:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[INT_VALUE]], i64 1234) + // CHECK: [[NULL_CHECK:%.*]] = icmp ne ptr [[LOADED_VALUE]], null + // CHECK: [[CAST_VALUE:%.*]] = ptrtoint ptr [[LOADED_VALUE]] to i64 + // CHECK: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_VALUE]], i32 2, i64 [[BLENDED_VALUE]]) +} + +// Just enough to verify that we actually authenticate a qualified Class +// CHECK: define ptr @getTestConstClassIsa(ptr %t) #0 { +Class getTestConstClassIsa(TestConstClass *t) { + return t->isa; + // CHECK: [[T_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: [[T:%.*]] = load ptr, ptr [[T_ADDR]], align 8 + // CHECK: [[IVAR:%.*]] = load i32, ptr @"OBJC_IVAR_$_TestConstClass.isa", align 8 + // CHECK: [[IVAR_CONV:%.*]] = sext i32 [[IVAR]] to i64 + // CHECK: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[T]], i64 [[IVAR_CONV]] + // CHECK: [[LOADED_VALUE:%.*]] = load ptr, ptr [[ADD_PTR]], align 8 + // CHECK: [[INT_VALUE:%.*]] = ptrtoint ptr [[ADD_PTR]] to i64 + // CHECK: [[BLENDED_VALUE:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[INT_VALUE]], i64 1234) + // CHECK: [[NULL_CHECK:%.*]] = icmp ne ptr [[LOADED_VALUE]], null + // CHECK: [[CAST_VALUE:%.*]] = ptrtoint ptr [[LOADED_VALUE]] to i64 + // CHECK: [[AUTHED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[CAST_VALUE]], i32 2, i64 [[BLENDED_VALUE]]) +} + +#endif diff --git a/clang/test/CodeGenObjC/ptrauth-objc-interface-selector.m b/clang/test/CodeGenObjC/ptrauth-objc-interface-selector.m new file mode 100644 index 0000000000000..15f7d03a0ba2d --- /dev/null +++ b/clang/test/CodeGenObjC/ptrauth-objc-interface-selector.m @@ -0,0 +1,130 @@ +// RUN: %clang_cc1 -O0 -Wno-objc-root-class -fptrauth-intrinsics -fptrauth-calls -nostdsysteminc -triple arm64-apple-ios -emit-llvm -fptrauth-objc-interface-sel -o - %s | FileCheck --check-prefix=CHECK-AUTHENTICATED-SEL %s +// RUN: %clang_cc1 -O0 -Wno-objc-root-class -fptrauth-intrinsics -fptrauth-calls -nostdsysteminc -triple arm64-apple-ios -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-UNAUTHENTICATED-SEL %s + +#include <ptrauth.h> +#define __ptrauth_objc_sel_override \ + __ptrauth(ptrauth_key_objc_sel_pointer, 1, 22467) + +@interface Test { +@public + SEL auto_sel; +@public + const SEL const_auto_sel; +@public + volatile SEL volatile_auto_sel; +@public + SEL __ptrauth_objc_sel_override manual; +@public + const SEL __ptrauth_objc_sel_override const_manual; +@public + volatile SEL __ptrauth_objc_sel_override volatile_manual; +@public + + SEL __ptrauth_objc_sel_override _manual_sel_property; +} + +@property SEL auto_sel_property; +@property const SEL const_auto_sel_property; +@property volatile SEL volatile_auto_sel_property; +@property SEL manual_sel_property; + +@end + +// CHECK-AUTHENTICATED-SEL-LABEL: define internal ptr @"\01-[Test test:]" +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 {{%.*}}, i32 3, i64 {{%.*}}, i32 3, i64 {{%.*}}) 
+// CHECK-AUTHENTICATED-SEL: {{%.*}} = load volatile ptr, ptr {{%.*}}, align 8 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 {{%.*}}, i32 3, i64 {{%.*}}, i32 3, i64 {{%.*}}) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22467) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = ptrtoint ptr {{%.*}} to i64 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22467) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = ptrtoint ptr {{%.*}} to i64 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 {{%.*}}, i32 3, i64 {{%.*}}, i32 3, i64 {{%.*}}) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.auth(i64 {{%.*}}, i32 3, i64 {{%.*}}) + +// CHECK-AUTHENTICATED-SEL-LABEL: define internal ptr @"\01-[Test auto_sel_property]" +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.auth(i64 {{%.*}}, i32 3, i64 {{%.*}}) + +// CHECK-AUTHENTICATED-SEL-LABEL: define internal void @"\01-[Test setAuto_sel_property:]" +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.sign(i64 {{%.*}}, i32 3, i64 {{%.*}}) + +// CHECK-AUTHENTICATED-SEL-LABEL: define internal ptr @"\01-[Test const_auto_sel_property]" +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.auth(i64 {{%.*}}, i32 3, i64 {{%.*}}) + +// CHECK-AUTHENTICATED-SEL-LABEL: define internal void @"\01-[Test setConst_auto_sel_property:]" +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.sign(i64 {{%.*}}, i32 3, i64 {{%.*}}) + +// CHECK-AUTHENTICATED-SEL-LABEL: define internal ptr @"\01-[Test volatile_auto_sel_property]" +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.auth(i64 {{%.*}}, i32 3, i64 {{%.*}}) + +// CHECK-AUTHENTICATED-SEL-LABEL: define internal void @"\01-[Test setVolatile_auto_sel_property:]" +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.sign(i64 {{%.*}}, i32 3, i64 {{%.*}}) + +@implementation Test +- (SEL)test:(Test *)in { + _auto_sel_property = in->_auto_sel_property; + _volatile_auto_sel_property = in->_volatile_auto_sel_property; + _manual_sel_property = in->_manual_sel_property; + return _const_auto_sel_property; +} +@end + +void auto_sel(Test *out, Test *in) { + out->auto_sel = in->auto_sel; +} +// CHECK-AUTHENTICATED-SEL-LABEL: define void @auto_sel +// CHECK-AUTHENTICATED-SEL: [[DST_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_DST_ADDR:%.*]], i64 22466) +// CHECK-AUTHENTICATED-SEL: [[CAST_SRC_ADDR:%.*]] = ptrtoint ptr [[SRC_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: [[SRC_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_SRC_ADDR]], i64 22466) +// CHECK-AUTHENTICATED-SEL: [[SRC_SEL:%.*]] = ptrtoint ptr [[SRC_SEL_ADDR:%.*]] to 
i64 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[SRC_SEL]], i32 3, i64 [[DST_DISCRIMINATOR]], i32 3, i64 [[SRC_DISCRIMINATOR]]) + +// CHECK-UNAUTHENTICATED-SEL-LABEL: define void @auto_sel +SEL const_auto_sel(Test *in) { + return in->const_auto_sel; +} + +// CHECK-AUTHENTICATED-SEL-LABEL: define ptr @const_auto_sel +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = ptrtoint ptr {{%.*}} to i64 +// CHECK-AUTHENTICATED-SEL: [[AUTHENTICATED:%.*]] = call i64 @llvm.ptrauth.auth(i64 {{%.*}}, i32 3, i64 {{%.*}}) +// CHECK-AUTHENTICATED-SEL: [[RESULT:%.*]] = inttoptr i64 [[AUTHENTICATED]] to ptr + +void volatile_auto_sel(Test *out, Test *in) { + out->volatile_auto_sel = in->volatile_auto_sel; +} + +// CHECK-AUTHENTICATED-SEL-LABEL: define void @volatile_auto_sel +// CHECK-AUTHENTICATED-SEL: [[DST_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_DST_ADDR:%.*]], i64 22466) +// CHECK-AUTHENTICATED-SEL: [[CAST_SRC_ADDR:%.*]] = ptrtoint ptr [[SRC_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: [[SRC_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_SRC_ADDR]], i64 22466) +// CHECK-AUTHENTICATED-SEL: [[SRC_SEL:%.*]] = ptrtoint ptr [[SRC_SEL_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[SRC_SEL]], i32 3, i64 [[DST_DISCRIMINATOR]], i32 3, i64 [[SRC_DISCRIMINATOR]]) + +void manual(Test *out, Test *in) { + out->manual = in->manual; +} + +// CHECK-AUTHENTICATED-SEL-LABEL: define void @manual +// CHECK-AUTHENTICATED-SEL: [[DST_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_DST_ADDR:%.*]], i64 22467) +// CHECK-AUTHENTICATED-SEL: [[CAST_SRC_ADDR:%.*]] = ptrtoint ptr [[SRC_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: [[SRC_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_SRC_ADDR]], i64 22467) +// CHECK-AUTHENTICATED-SEL: [[SRC_SEL:%.*]] = ptrtoint ptr [[SRC_SEL_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[SRC_SEL]], i32 3, i64 [[DST_DISCRIMINATOR]], i32 3, i64 [[SRC_DISCRIMINATOR]]) + +// CHECK-UNAUTHENTICATED-SEL-LABEL: define void @manual +// CHECK-UNAUTHENTICATED-SEL: [[DST_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_DST_ADDR:%.*]], i64 22467) +// CHECK-UNAUTHENTICATED-SEL: [[CAST_SRC_ADDR:%.*]] = ptrtoint ptr [[SRC_ADDR:%.*]] to i64 +// CHECK-UNAUTHENTICATED-SEL: [[SRC_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_SRC_ADDR]], i64 22467) +// CHECK-UNAUTHENTICATED-SEL: [[SRC_SEL:%.*]] = ptrtoint ptr [[SRC_SEL_ADDR:%.*]] to i64 +// CHECK-UNAUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[SRC_SEL]], i32 3, i64 [[DST_DISCRIMINATOR]], i32 3, i64 [[SRC_DISCRIMINATOR]]) diff --git a/clang/test/CodeGenObjC/ptrauth-objc-isa-super.m b/clang/test/CodeGenObjC/ptrauth-objc-isa-super.m new file mode 100644 index 0000000000000..69fd01f28067d --- /dev/null +++ b/clang/test/CodeGenObjC/ptrauth-objc-isa-super.m @@ -0,0 +1,56 @@ +// RUN: %clang_cc1 -I %S/Inputs -fptrauth-calls -fptrauth-objc-isa -triple arm64-apple-ios -emit-llvm -no-enable-noundef-analysis -fblocks -fobjc-arc -fobjc-runtime-has-weak -O2 -disable-llvm-passes -o - %s | FileCheck %s + +#include "literal-support.h" + +#if __has_feature(objc_bool) +#define YES __objc_yes +#define NO __objc_no +#else +#define YES ((BOOL)1) +#define NO ((BOOL)0) +#endif + +@class NSString; + +// CHECK: @"OBJC_METACLASS_$_C" = global %struct._class_t { ptr ptrauth (ptr 
@"OBJC_METACLASS_$_Base", i32 2, i64 27361, ptr @"OBJC_METACLASS_$_C"), ptr ptrauth (ptr @"OBJC_METACLASS_$_Base", i32 2, i64 46507, ptr getelementptr inbounds (%struct._class_t, ptr @"OBJC_METACLASS_$_C", i32 0, i32 1)), ptr @_objc_empty_cache, ptr null, ptr @"_OBJC_METACLASS_RO_$_C" } +// CHECK: @"OBJC_CLASSLIST_SUP_REFS_$_" = private global ptr @"OBJC_METACLASS_$_C" +// CHECK: @OBJC_METH_VAR_NAME_ = private unnamed_addr constant [5 x i8] c"test\00" +// CHECK: @OBJC_SELECTOR_REFERENCES_ = internal externally_initialized global ptr @OBJC_METH_VAR_NAME_ +// CHECK: @"OBJC_METACLASS_$_Base" = external global %struct._class_t +// CHECK: @OBJC_CLASS_NAME_ = private unnamed_addr constant [2 x i8] c"C\00" +// CHECK: @OBJC_METH_VAR_NAME_.1 = private unnamed_addr constant [11 x i8] c"super_test\00" +// CHECK: @OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [8 x i8] c"v16@0:8\00" +// CHECK: @"_OBJC_$_CLASS_METHODS_C" = internal global { i32, i32, [1 x %struct._objc_method] } { i32 24, i32 1, [1 x %struct._objc_method] [%struct._objc_method { ptr @OBJC_METH_VAR_NAME_.1, ptr @OBJC_METH_VAR_TYPE_, ptr ptrauth (ptr @"\01+[C super_test]", i32 0, i64 0, ptr getelementptr inbounds ({ i32, i32, [1 x %struct._objc_method] }, ptr @"_OBJC_$_CLASS_METHODS_C", i32 0, i32 2, i32 0, i32 2)) }] } +// CHECK: @"_OBJC_METACLASS_RO_$_C" = internal global %struct._class_ro_t { i32 129, i32 40, i32 40, ptr null, ptr @OBJC_CLASS_NAME_, ptr ptrauth (ptr @"_OBJC_$_CLASS_METHODS_C", i32 2, i64 49936, ptr getelementptr inbounds (%struct._class_ro_t, ptr @"_OBJC_METACLASS_RO_$_C", i32 0, i32 5)), ptr null, ptr null, ptr null, ptr null } +// CHECK: @"OBJC_CLASS_$_Base" = external global %struct._class_t +// CHECK: @"_OBJC_CLASS_RO_$_C" = internal global %struct._class_ro_t { i32 128, i32 0, i32 0, ptr null, ptr @OBJC_CLASS_NAME_, ptr null, ptr null, ptr null, ptr null, ptr null } +// @"_OBJC_CLASS_RO_$_C" = internal global %struct._class_ro_t { i32 128, i32 0, i32 0, ptr null, ptr @OBJC_CLASS_NAME_, ptr ptrauth (ptr null, i32 2, i64 49936, ptr getelementptr inbounds (%struct._class_ro_t, ptr @"_OBJC_CLASS_RO_$_C", i32 0, i32 5)), ptr null, ptr null, ptr null, ptr null } +// CHECK: @"OBJC_CLASS_$_C" = global %struct._class_t { ptr ptrauth (ptr @"OBJC_METACLASS_$_C", i32 2, i64 27361, ptr @"OBJC_CLASS_$_C"), ptr ptrauth (ptr @"OBJC_CLASS_$_Base", i32 2, i64 46507, ptr getelementptr inbounds (%struct._class_t, ptr @"OBJC_CLASS_$_C", i32 0, i32 1)), ptr @_objc_empty_cache, ptr null, ptr @"_OBJC_CLASS_RO_$_C" } +// CHECK: @"OBJC_LABEL_CLASS_$" = private global [1 x ptr] [ptr @"OBJC_CLASS_$_C"] + +@interface Base ++ (void)test; +@end + +@interface C : Base +@end + +@implementation C +// CHECK-LABEL: define internal void @"\01+[C super_test]"(ptr %self, ptr %_cmd) #1 { ++ (void)super_test { + return [super test]; + // CHECK: [[SELF_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: [[CMD_ADDR:%.*]] = alloca ptr, align 8 + // CHECK: [[SUPER_STRUCT:%.*]] = alloca %struct._objc_super, align 8 + // CHECK: store ptr %self, ptr [[SELF_ADDR]], align 8, !tbaa !{{[0-9]+}} + // CHECK: store ptr %_cmd, ptr [[CMD_ADDR]], align 8, !tbaa !{{[0-9]+}} + // CHECK: [[TARGET:%.*]] = load ptr, ptr [[SELF_ADDR]], align 8, !tbaa !{{[0-9]+}} + // CHECK: [[OBJC_SUPER_TARGET:%.*]] = getelementptr inbounds nuw %struct._objc_super, ptr [[SUPER_STRUCT]], i32 0, i32 0 + // CHECK: store ptr [[TARGET]], ptr [[OBJC_SUPER_TARGET]], align 8 + // CHECK: [[SUPER_REFERENCES:%.*]] = load ptr, ptr @"OBJC_CLASSLIST_SUP_REFS_$_" + // CHECK: [[OBJC_SUPER_SUPER:%.*]] = 
getelementptr inbounds nuw %struct._objc_super, ptr [[SUPER_STRUCT]], i32 0, i32 1 + // CHECK: store ptr [[SUPER_REFERENCES]], ptr [[OBJC_SUPER_SUPER]], align 8 + // CHECK: call void @objc_msgSendSuper2(ptr %objc_super, ptr %4) +} +@end + +id str = @""; diff --git a/clang/test/CodeGenObjC/ptrauth-objc-method-list-pointer.m b/clang/test/CodeGenObjC/ptrauth-objc-method-list-pointer.m new file mode 100644 index 0000000000000..e7cb5642ee490 --- /dev/null +++ b/clang/test/CodeGenObjC/ptrauth-objc-method-list-pointer.m @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -Wno-objc-root-class -fptrauth-calls -triple arm64e -fptrauth-objc-class-ro %s -emit-llvm -o - | FileCheck %s +@interface X +@end + +@implementation X +-(void)meth {} +@end + +// CHECK: @"OBJC_CLASS_$_X" = global %struct._class_t { ptr @"OBJC_METACLASS_$_X", ptr null, ptr @_objc_empty_cache, ptr null, ptr ptrauth (ptr @"_OBJC_CLASS_RO_$_X", i32 2, i64 25080, ptr getelementptr inbounds (%struct._class_t, ptr @"OBJC_CLASS_$_X", i32 0, i32 4)) } +// CHECK: @"OBJC_METACLASS_$_X" = global %struct._class_t { ptr @"OBJC_METACLASS_$_X", ptr @"OBJC_CLASS_$_X", ptr @_objc_empty_cache, ptr null, ptr ptrauth (ptr @"_OBJC_METACLASS_RO_$_X", i32 2, i64 25080, ptr getelementptr inbounds (%struct._class_t, ptr @"OBJC_METACLASS_$_X", i32 0, i32 4)) } +// CHECK: @OBJC_CLASS_NAME_ = private unnamed_addr constant [2 x i8] c"X\00" +// CHECK: @"_OBJC_METACLASS_RO_$_X" = private global %struct._class_ro_t { i32 3, i32 40, i32 40, ptr null, ptr @OBJC_CLASS_NAME_, ptr null, ptr null, ptr null, ptr null, ptr null } +// CHECK: @OBJC_METH_VAR_NAME_ = private unnamed_addr constant [5 x i8] c"meth\00" +// CHECK: @OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [8 x i8] c"v16@0:8\00" +// CHECK: @"_OBJC_$_INSTANCE_METHODS_X" = private global { i32, i32, [1 x %struct._objc_method] } { i32 24, i32 1, [1 x %struct._objc_method] [%struct._objc_method { ptr @OBJC_METH_VAR_NAME_, ptr @OBJC_METH_VAR_TYPE_, ptr ptrauth (ptr @"\01-[X meth]", i32 0, i64 0, ptr getelementptr inbounds ({ i32, i32, [1 x %struct._objc_method] }, ptr @"_OBJC_$_INSTANCE_METHODS_X", i32 0, i32 2, i32 0, i32 2)) }] } +// CHECK: @"_OBJC_CLASS_RO_$_X" = private global %struct._class_ro_t { i32 2, i32 0, i32 0, ptr null, ptr @OBJC_CLASS_NAME_, ptr ptrauth (ptr @"_OBJC_$_INSTANCE_METHODS_X", i32 2, i64 49936, ptr getelementptr inbounds (%struct._class_ro_t, ptr @"_OBJC_CLASS_RO_$_X", i32 0, i32 5)), ptr null, ptr null, ptr null, ptr null } +// CHECK: @"OBJC_LABEL_CLASS_$" = private global [1 x ptr] [ptr @"OBJC_CLASS_$_X"]
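The next file, ptrauth-property-backing.m, covers __ptrauth-qualified ivars behind properties. As a reminder of the source-level model, here is a minimal sketch under the same qualifier the test puts on _field3 (key 1, address-diversified, extra discriminator 1); struct Blob, demo, and fn are illustrative names, not part of the patch:

    #include <ptrauth.h>

    typedef void (*func)(void);

    struct Blob {
      /* Stores into f sign the value for this slot; loads authenticate it. */
      func __ptrauth(ptrauth_key_asib, 1, 1) f;
    };

    void demo(struct Blob *b, func fn) {
      b->f = fn; /* resign from the default function-pointer schema to
                    key 1 blended with (&b->f, 1) */
      b->f();    /* resign back to the call schema before the indirect call */
    }

Because the synthesized getters and setters copy values through these qualified slots, the CHECK lines below are dominated by llvm.ptrauth.resign rather than plain sign/auth pairs.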
diff --git a/clang/test/CodeGenObjC/ptrauth-property-backing.m b/clang/test/CodeGenObjC/ptrauth-property-backing.m new file mode 100644 index 0000000000000..8c6bcb45e7446 --- /dev/null +++ b/clang/test/CodeGenObjC/ptrauth-property-backing.m @@ -0,0 +1,80 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -emit-llvm -fexceptions -fptrauth-intrinsics -o - %s | FileCheck %s + +typedef void (*func)(); + +__attribute__((objc_root_class)) +@interface Root { + Class isa; + void *__ptrauth(1, 1, 1) _field1; + void *__ptrauth(1, 1, 1) _field2; + func __ptrauth(1, 1, 1) _field3; + func __ptrauth(1, 1, 123) _field4; +} + +@property void *field1; +@property(nonatomic) void *field2; +@property func field3; +@property(nonatomic) func field4; +@end + +@implementation Root +@end + +// CHECK-LABEL: define internal ptr @"\01-[Root field1]" +// CHECK: [[LOAD:%.*]] = load atomic i64, ptr [[ADDR:%.*]] unordered +// CHECK: [[CAST_ADDR:%.*]] = ptrtoint ptr [[ADDR]] to i64 +// CHECK: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_ADDR]], i64 1) +// CHECK: [[RESULT:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[LOAD]], i32 1, i64 [[BLEND]]) + +// CHECK-LABEL: define internal void @"\01-[Root setField1:]" +// CHECK: [[CAST_ADDR:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +// CHECK: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_ADDR]], i64 1) +// CHECK: [[RESULT:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[VALUE:%.*]], i32 1, i64 [[BLEND]]) +// CHECK: [[PHI:%.*]] = phi i64 [ 0, {{%.*}} ], [ [[RESULT]], {{%.*}} ] +// CHECK: store atomic i64 [[PHI]], ptr [[ADDR]] unordered + +// CHECK-LABEL: define internal ptr @"\01-[Root field2]" +// CHECK: load ptr, ptr +// CHECK: [[LOAD:%.*]] = load ptr, ptr [[ADDR:%.*]], +// CHECK: [[CAST_ADDR:%.*]] = ptrtoint ptr [[ADDR]] to i64 +// CHECK: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_ADDR]], i64 1) +// CHECK: [[VALUE:%.*]] = ptrtoint ptr [[LOAD]] to i64 +// CHECK: [[RESULT:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VALUE]], i32 1, i64 [[BLEND]]) + +// CHECK-LABEL: define internal void @"\01-[Root setField2:]" +// CHECK: [[CAST_ADDR:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +// CHECK: [[BLEND:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_ADDR]], i64 1) +// CHECK: [[CAST_VALUE:%.*]] = ptrtoint ptr [[VALUE:%.*]] to i64 +// CHECK: [[SIGNED:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[CAST_VALUE]], i32 1, i64 [[BLEND]]) +// CHECK: [[RESULT:%.*]] = inttoptr i64 [[SIGNED]] to ptr +// CHECK: [[PHI:%.*]] = phi ptr [ null, {{%.*}} ], [ [[RESULT]], {{%.*}} ] +// CHECK: store ptr [[PHI]], ptr [[ADDR]] + +// CHECK-LABEL: define internal ptr @"\01-[Root field3]" +// CHECK: [[VALUE:%.*]] = load atomic i64, ptr [[ADDR:%.*]] unordered, align 8 +// CHECK: [[CASTED_ADDR:%.*]] = ptrtoint ptr [[ADDR]] to i64 +// CHECK: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CASTED_ADDR]], i64 1) +// CHECK: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[VALUE]], i32 1, i64 [[BLENDED]], i32 0, i64 0) + +// CHECK-LABEL: define internal void @"\01-[Root setField3:]" +// CHECK: [[VALUE:%.*]] = load i64, ptr {{%.*}}, align 8 +// CHECK: [[CASTED_ADDR:%.*]] = ptrtoint ptr {{%.*}} to i64 +// CHECK: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CASTED_ADDR]], i64 1) +// CHECK: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[VALUE]], i32 0, i64 0, i32 1, i64 [[BLENDED]]) +// CHECK: store atomic i64 + +// CHECK-LABEL: define internal ptr @"\01-[Root field4]" +// CHECK: load ptr, ptr +// CHECK: [[VALUE:%.*]] = load ptr, ptr [[ADDR:%.*]], +// CHECK: [[CASTED_ADDR:%.*]] = ptrtoint ptr [[ADDR]] to i64 +// CHECK: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CASTED_ADDR]], i64 123) +// CHECK: [[CAST_VALUE:%.*]] = ptrtoint ptr [[VALUE]] to i64 +// CHECK: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[CAST_VALUE]], i32 1, i64 [[BLENDED]], i32 0, i64 0) + +// CHECK-LABEL: define internal void @"\01-[Root setField4:]" +// CHECK: [[CAST_ADDR:%.*]] = ptrtoint ptr {{%.*}} to i64 +// CHECK: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_ADDR]], i64 123) +// CHECK: resign.nonnull: +// CHECK: [[VALUE:%.*]] = ptrtoint ptr %1 to i64 +// CHECK: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[VALUE]], i32 0, i64 0, i32 1, i64 [[BLENDED]]) + diff --git a/clang/test/CodeGenObjCXX/ptrauth-objc-interface-selector.mm b/clang/test/CodeGenObjCXX/ptrauth-objc-interface-selector.mm new file mode 100644 index 0000000000000..fc90f5ffcbcf6 --- /dev/null +++ b/clang/test/CodeGenObjCXX/ptrauth-objc-interface-selector.mm @@ -0,0 +1,90 @@ +// RUN: %clang_cc1 -O0 -Wno-objc-root-class 
-fptrauth-intrinsics -fptrauth-calls -nostdsysteminc -triple arm64e-apple-ios -emit-llvm -fptrauth-objc-interface-sel -o - %s | FileCheck --check-prefix=CHECK-AUTHENTICATED-SEL %s +// RUN: %clang_cc1 -O0 -Wno-objc-root-class -fptrauth-intrinsics -fptrauth-calls -nostdsysteminc -triple arm64e-apple-ios -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-UNAUTHENTICATED-SEL %s + +#include <ptrauth.h> +#define __ptrauth_objc_sel_override \ + __ptrauth(ptrauth_key_objc_sel_pointer, 1, 22467) + +extern "C" { + +@interface Test { +@public + SEL auto_sel; +@public + const SEL const_auto_sel; +@public + volatile SEL volatile_auto_sel; +@public + SEL __ptrauth_objc_sel_override manual; +@public + const SEL __ptrauth_objc_sel_override const_manual; +@public + volatile SEL __ptrauth_objc_sel_override volatile_manual; +} + +@end +#if __has_feature(ptrauth_objc_interface_sel) +typedef const SEL __ptrauth_objc_sel const_auto_sel_ptr_type; +const_auto_sel_ptr_type *const_auto_sel_ptr_type_test; +typedef volatile SEL __ptrauth_objc_sel volatile_auto_sel_ptr_type; +volatile_auto_sel_ptr_type *volatile_auto_sel_ptr_type_test; +#else +typedef const SEL const_auto_sel_ptr_type; +const_auto_sel_ptr_type *const_auto_sel_ptr_type_test; +typedef volatile SEL volatile_auto_sel_ptr_type; +volatile_auto_sel_ptr_type *volatile_auto_sel_ptr_type_test; +#endif + +void auto_sel(Test *out, Test *in) { + out->auto_sel = in->auto_sel; +} +// CHECK-AUTHENTICATED-SEL: define void @auto_sel +// CHECK-AUTHENTICATED-SEL: [[DST_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_DST_ADDR:%.*]], i64 22466) +// CHECK-AUTHENTICATED-SEL: [[CAST_SRC_ADDR:%.*]] = ptrtoint ptr [[SRC_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: [[SRC_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_SRC_ADDR]], i64 22466) +// CHECK-AUTHENTICATED-SEL: [[SRC_SEL:%.*]] = ptrtoint ptr [[SRC_SEL_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[SRC_SEL]], i32 3, i64 [[DST_DISCRIMINATOR]], i32 3, i64 [[SRC_DISCRIMINATOR]]) + +// CHECK-UNAUTHENTICATED-SEL: define void @auto_sel +SEL const_auto_sel(Test *in) { + const_auto_sel_ptr_type_test = &in->const_auto_sel; + return in->const_auto_sel; +} + +// CHECK-AUTHENTICATED-SEL: define ptr @const_auto_sel +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 22466) +// CHECK-AUTHENTICATED-SEL: {{%.*}} = ptrtoint ptr {{%.*}} to i64 +// CHECK-AUTHENTICATED-SEL: [[AUTHENTICATED:%.*]] = call i64 @llvm.ptrauth.auth(i64 {{%.*}}, i32 3, i64 {{%.*}}) +// CHECK-AUTHENTICATED-SEL: [[RESULT:%.*]] = inttoptr i64 [[AUTHENTICATED]] to ptr + +void volatile_auto_sel(Test *out, Test *in) { + volatile_auto_sel_ptr_type_test = &in->volatile_auto_sel; + out->volatile_auto_sel = in->volatile_auto_sel; +} + +// CHECK-AUTHENTICATED-SEL: define void @volatile_auto_sel +// CHECK-AUTHENTICATED-SEL: [[DST_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_DST_ADDR:%.*]], i64 22466) +// CHECK-AUTHENTICATED-SEL: [[CAST_SRC_ADDR:%.*]] = ptrtoint ptr [[SRC_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: [[SRC_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_SRC_ADDR]], i64 22466) +// CHECK-AUTHENTICATED-SEL: [[SRC_SEL:%.*]] = ptrtoint ptr [[SRC_SEL_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[SRC_SEL]], i32 3, i64 [[DST_DISCRIMINATOR]], i32 3, i64 [[SRC_DISCRIMINATOR]]) + +void manual(Test *out, Test *in) { + out->manual = in->manual; +} + +// CHECK-AUTHENTICATED-SEL: define void @manual +// 
CHECK-AUTHENTICATED-SEL: [[DST_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_DST_ADDR:%.*]], i64 22467) +// CHECK-AUTHENTICATED-SEL: [[CAST_SRC_ADDR:%.*]] = ptrtoint ptr [[SRC_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: [[SRC_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_SRC_ADDR]], i64 22467) +// CHECK-AUTHENTICATED-SEL: [[SRC_SEL:%.*]] = ptrtoint ptr [[SRC_SEL_ADDR:%.*]] to i64 +// CHECK-AUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[SRC_SEL]], i32 3, i64 [[DST_DISCRIMINATOR]], i32 3, i64 [[SRC_DISCRIMINATOR]]) + +// CHECK-UNAUTHENTICATED-SEL: define void @manual +// CHECK-UNAUTHENTICATED-SEL: [[DST_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_DST_ADDR:%.*]], i64 22467) +// CHECK-UNAUTHENTICATED-SEL: [[CAST_SRC_ADDR:%.*]] = ptrtoint ptr [[SRC_ADDR:%.*]] to i64 +// CHECK-UNAUTHENTICATED-SEL: [[SRC_DISCRIMINATOR:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[CAST_SRC_ADDR]], i64 22467) +// CHECK-UNAUTHENTICATED-SEL: [[SRC_SEL:%.*]] = ptrtoint ptr [[SRC_SEL_ADDR:%.*]] to i64 +// CHECK-UNAUTHENTICATED-SEL: {{%.*}} = call i64 @llvm.ptrauth.resign(i64 [[SRC_SEL]], i32 3, i64 [[DST_DISCRIMINATOR]], i32 3, i64 [[SRC_DISCRIMINATOR]]) + +}
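Both selector tests check the same copy pattern: with address diversification, a SEL signed for in's slot is not valid for out's slot, so out->field = in->field must authenticate against the source address's blended discriminator and re-sign against the destination's, which clang emits as a single llvm.ptrauth.resign. A rough manual equivalent (sketch only; copy_sel is an illustrative name, and 22467 mirrors the __ptrauth_objc_sel_override fields above):

    #include <ptrauth.h>

    /* Roughly what the compiler-generated copy of a
     * __ptrauth(ptrauth_key_objc_sel_pointer, 1, 22467) field amounts to. */
    static void copy_sel(void **dst, void *const *src) {
      *dst = ptrauth_auth_and_resign(
          *src, ptrauth_key_objc_sel_pointer,
          ptrauth_blend_discriminator(src, 22467),
          ptrauth_key_objc_sel_pointer,
          ptrauth_blend_discriminator(dst, 22467));
    }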
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32 // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl new file mode 100644 index 0000000000000..e4ef3defdb341 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl @@ -0,0 +1,433 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250 + +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef float v2f __attribute__((ext_vector_type(2))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef __bf16 v32bf16 __attribute__((ext_vector_type(32))); +typedef __bf16 v16bf16 __attribute__((ext_vector_type(16))); +typedef __bf16 v8bf16 __attribute__((ext_vector_type(8))); +typedef int v16i __attribute__((ext_vector_type(16))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x4_f32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> [[A:%.*]], i1 false, <2 x float> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 false) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, true, false); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16_16x16x32_bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> 
[[B:%.*]], i16 0, <8 x bfloat> [[C:%.*]], i1 false, i1 false) +// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8bf16 c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, false, false); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16f32_16x16x32_bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16.v8f32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8f c) +{ + *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_fp8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x64_fp8_fp8(global v8f* out, v8i a, v8i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, true, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_fp8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x64_fp8_bf8(global v8f* out, v8i a, v8i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_bf8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x64_bf8_fp8(global v8f* out, v8i a, v8i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_bf8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x64_bf8_bf8(global v8f* out, v8i a, v8i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: 
@test_amdgcn_wmma_f16_16x16x64_fp8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x64_fp8_fp8(global v8h* out, v8i a, v8i b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_fp8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x64_fp8_bf8(global v8h* out, v8i a, v8i b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_bf8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x64_bf8_fp8(global v8h* out, v8i a, v8i b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_bf8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f 
c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x32_f16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_fp8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_fp8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x128_fp8_bf8(global v8h* out, v16i a, v16i b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_bf8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x128_bf8_fp8(global v8h* out, v16i a, v16i b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_bf8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x128_bf8_bf8(global v8h* out, v16i a, v16i b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_fp8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr 
addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x128_fp8_fp8(global v8f* out, v16i a, v16i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, true, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_fp8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x128_fp8_bf8(global v8f* out, v16i a, v16i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_bf8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x128_bf8_fp8(global v8f* out, v16i a, v16i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_bf8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x128_bf8_bf8(global v8f* out, v16i a, v16i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]]) +// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c) +{ + *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, 0, c); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_bf16_16x16x64_bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail 
call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x bfloat> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x64_bf16(global v8bf16* out, v16bf16 a, v32bf16 b, v8bf16 c, int index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_bf16f32_16x16x64_bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_bf16f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index) +{ + *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_fp8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x128_fp8_fp8(global v8f* out, v8i a, v16i b, v8f c, int index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8(a, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_fp8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x128_fp8_bf8(global v8f* out, v8i a, v16i b, v8f c, int index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8(a, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_bf8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x128_bf8_fp8(global v8f* out, v8i a, v16i b, v8f c, int index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8(a, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_bf8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], 
i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x128_bf8_bf8(global v8f* out, v8i a, v16i b, v8f c, int index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8(a, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_fp8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x128_fp8_fp8(global v8h* out, v8i a, v16i b, v8h c, int index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8(a, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_fp8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x128_fp8_bf8(global v8h* out, v8i a, v16i b, v8h c, int index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8(a, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_bf8_fp8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x128_bf8_fp8(global v8h* out, v8i a, v16i b, v8h c, int index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8(a, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_bf8_bf8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(global v8h* out, v8i a, v16i b, v8h c, int index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8(a, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* 
out, v8i a, v16i b, v8i c, int index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_f16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i32(i1 false, <16 x half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f c, int index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, false, true); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x64_f16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i32(i1 false, <16 x half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true) +// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1250-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x64_f16(global v8h* out, v16h a, v32h b, v8h c, int index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, false, true); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 830ceabb1cb29..738b7ab7f2b75 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -42,6 +42,45 @@ void test_s_wait_tensorcnt() { __builtin_amdgcn_s_wait_tensorcnt(0); } +// CHECK-LABEL: @test_tanh_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: ret void +// +void test_tanh_f32(global float* out, float a) +{ + *out = __builtin_amdgcn_tanhf(a); +} + +// CHECK-LABEL: @test_tanh_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr addrspace(1) [[TMP0]], align 2 
+// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.amdgcn.tanh.f16(half [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 +// CHECK-NEXT: ret void +// +void test_tanh_f16(global half* out, global half* a) +{ + *out = __builtin_amdgcn_tanhh(*a); +} + // CHECK-LABEL: @test_tanh_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -61,6 +100,120 @@ void test_tanh_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_tanh_bf16(a); } +// CHECK-LABEL: @test_rcp_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rcp.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_rcp_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_rcp_bf16(a); +} + +// CHECK-LABEL: @test_rsq_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_rsq_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_rsq_bf16(a); +} + +// CHECK-LABEL: @test_log_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_log_bf16(global __bf16* out, __bf16 a) +{ + *out = 
__builtin_amdgcn_log_bf16(a); +} + +// CHECK-LABEL: @test_exp2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.exp2.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_exp2_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_exp2_bf16(a); +} + +// CHECK-LABEL: @test_sin_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sin.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_sin_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_sin_bf16(a); +} + +// CHECK-LABEL: @test_cos_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.cos.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_cos_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_cos_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl index 8256b61525f9d..177165972b7a9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl @@ -10,3 +10,12 @@ void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t 
rsrc, __local void * lds, int offset, int soffset) { __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 1, offset, soffset, 2, 3); } + +// CHECK-LABEL: @test_amdgcn_struct_ptr_buffer_load_lds( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 4, i32 [[VINDEX:%.*]], i32 [[VOFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3) +// CHECK-NEXT: ret void +// +void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int size, int vindex, int voffset, int soffset) { + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 2, 3); +} diff --git a/clang/test/CodeGenOpenCL/scoped-atomic.cl b/clang/test/CodeGenOpenCL/scoped-atomic.cl new file mode 100644 index 0000000000000..ec7e936684a3a --- /dev/null +++ b/clang/test/CodeGenOpenCL/scoped-atomic.cl @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir-unknown-unknown -verify +// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir64-unknown-unknown -verify + +// expected-no-diagnostics + +int fi1a(int *i) { + int v; + __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); + return v; +} + +#ifdef __SPIR64__ +long fl1a(long *i) { + long v; + __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); + return v; +} +#endif diff --git a/clang/test/CodeGenSPIRV/Builtins/refract.c b/clang/test/CodeGenSPIRV/Builtins/refract.c new file mode 100644 index 0000000000000..f399462d68d4a --- /dev/null +++ b/clang/test/CodeGenSPIRV/Builtins/refract.c @@ -0,0 +1,74 @@ +// RUN: %clang_cc1 -O1 -triple spirv-pc-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s + +typedef _Float16 half; +typedef half half2 __attribute__((ext_vector_type(2))); +typedef half half3 __attribute__((ext_vector_type(3))); +typedef half half4 __attribute__((ext_vector_type(4))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: define spir_func half @test_refract_half( +// CHECK-SAME: half noundef [[I:%.*]], half noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call half @llvm.spv.refract.f16.f16(half [[I]], half [[N]], half [[ETA]]) +// CHECK-NEXT: ret half [[SPV_REFRACT]] +// +half test_refract_half(half I, half N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <2 x half> @test_refract_half2( +// CHECK-SAME: <2 x half> noundef [[I:%.*]], <2 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call <2 x half> @llvm.spv.refract.v2f16.f16(<2 x half> [[I]], <2 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <2 x half> [[SPV_REFRACT]] +// +half2 test_refract_half2(half2 I, half2 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <3 x half> @test_refract_half3( +// CHECK-SAME: <3 x half> noundef [[I:%.*]], <3 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <3 x half> @llvm.spv.refract.v3f16.f16(<3 x half> [[I]], <3 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <3 x half> [[SPV_REFRACT]] +// 
+half3 test_refract_half3(half3 I, half3 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <4 x half> @test_refract_half4( +// CHECK-SAME: <4 x half> noundef [[I:%.*]], <4 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half> [[I]], <4 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <4 x half> [[SPV_REFRACT]] +// +half4 test_refract_half4(half4 I, half4 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + + +// CHECK-LABEL: define spir_func float @test_refract_float( +// CHECK-SAME: float noundef [[I:%.*]], float noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call float @llvm.spv.refract.f32.f32(float [[I]], float [[N]], float [[ETA]]) +// CHECK-NEXT: ret float [[SPV_REFRACT]] +// +float test_refract_float(float I, float N, float eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <2 x float> @test_refract_float2( +// CHECK-SAME: <2 x float> noundef [[I:%.*]], <2 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call <2 x float> @llvm.spv.refract.v2f32.f32(<2 x float> [[I]], <2 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <2 x float> [[SPV_REFRACT]] +// +float2 test_refract_float2(float2 I, float2 N, float eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <3 x float> @test_refract_float3( +// CHECK-SAME: <3 x float> noundef [[I:%.*]], <3 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <3 x float> @llvm.spv.refract.v3f32.f32(<3 x float> [[I]], <3 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <3 x float> [[SPV_REFRACT]] +// +float3 test_refract_float3(float3 I, float3 N, float eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <4 x float> @test_refract_float4( +// CHECK-SAME: <4 x float> noundef [[I:%.*]], <4 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float> [[I]], <4 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <4 x float> [[SPV_REFRACT]] +// +float4 test_refract_float4(float4 I, float4 N, float eta) { return __builtin_spirv_refract(I, N, eta); } diff --git a/clang/test/DebugInfo/KeyInstructions/array-cookie.cpp b/clang/test/DebugInfo/KeyInstructions/array-cookie.cpp new file mode 100644 index 0000000000000..cfa343551d162 --- /dev/null +++ b/clang/test/DebugInfo/KeyInstructions/array-cookie.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -gkey-instructions %s -debug-info-kind=line-tables-only -gno-column-info -emit-llvm -o - \ +// RUN: | FileCheck %s + +// Array cookie store doesn't need to be a key instruction. 
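+// Background (editorial aside, grounded in the CHECK lines below): under the Itanium C++ ABI, `new a[2]` for a type with a non-trivial destructor allocates extra space for an "array cookie" -- the element count that `delete[]` later reads back -- which is why `_Znam` below is called with 10 bytes on this x86_64 triple (an 8-byte cookie plus two 1-byte elements), and why a `store i64 2` of the count into the cookie slot precedes any use of the array.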
+ +struct a { char c; ~a(); }; +void b() { new a[2]; } + +// CHECK: %call = call {{.*}}ptr @_Znam(i64 noundef 10) +// CHECK-NEXT: store i64 2, ptr %call, align 8, !dbg [[DBG:!.*]] + +// CHECK: [[DBG]] = !DILocation(line: 7, scope: ![[#]]) diff --git a/clang/test/Driver/DTLTO/dtlto.c b/clang/test/Driver/DTLTO/dtlto.c new file mode 100644 index 0000000000000..96795d9a4e6a4 --- /dev/null +++ b/clang/test/Driver/DTLTO/dtlto.c @@ -0,0 +1,48 @@ +// REQUIRES: lld + +/// Check DTLTO options are forwarded to the linker. + +/// Check that options are forwarded as expected with --thinlto-distributor=. +// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ +// RUN: -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 \ +// RUN: -fthinlto-distributor=d.exe -Werror 2>&1 | \ +// RUN: FileCheck %s --check-prefix=FORWARD + +// FORWARD: ld.lld +// FORWARD-SAME: "--thinlto-distributor=d.exe" +// FORWARD-SAME: "--thinlto-remote-compiler={{[^"]+}}" +// FORWARD-SAME: "--thinlto-distributor-arg=a1" +// FORWARD-SAME: "--thinlto-distributor-arg=a2" +// FORWARD-SAME: "--thinlto-distributor-arg=a3" + +/// Check that options are not added without --thinlto-distributor= and +/// that a warning is issued for unused -Xthinlto-distributor options. +// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ +// RUN: -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 2>&1 | \ +// RUN: FileCheck %s --check-prefix=NODIST --implicit-check-not=distributor \ +// RUN: --implicit-check-not=remote-compiler + +// NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a1' +// NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a2,a3' +// NODIST: ld.lld + +/// Check the expected arguments are forwarded by default with only +/// --thinlto-distributor=. +// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ +// RUN: -fthinlto-distributor=d.exe -Werror 2>&1 | \ +// RUN: FileCheck %s --check-prefix=DEFAULT --implicit-check-not=distributor \ +// RUN: --implicit-check-not=remote-compiler + +// DEFAULT: ld.lld +// DEFAULT-SAME: "--thinlto-distributor=d.exe" +// DEFAULT-SAME: "--thinlto-remote-compiler={{.*}}clang{{[^\"]*}}" + +/// Check that nothing is forwarded when the compiler is not in LTO mode, and that +/// appropriate unused option warnings are issued. 
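+/// (Aside, for illustration only -- the distributor script name `distribute.py` and its `--jobs,4` argument are made-up placeholders, not part of this test: +/// clang -flto=thin -fuse-ld=lld -fthinlto-distributor=distribute.py -Xthinlto-distributor=--jobs,4 main.c +/// lld would then receive --thinlto-distributor=distribute.py together with the separate --thinlto-distributor-arg=--jobs and --thinlto-distributor-arg=4 values, as the FORWARD checks above demonstrate.)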
+// RUN: %clang %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ +// RUN: -fthinlto-distributor=d.exe 2>&1 | \ +// RUN: FileCheck %s --check-prefix=NOFLTO --implicit-check-not=distributor \ +// RUN: --implicit-check-not=remote-compiler + +// NOFLTO: warning: argument unused during compilation: '-fthinlto-distributor=d.exe' +// NOFLTO: ld.lld diff --git a/clang/test/Driver/arm-alignment.c b/clang/test/Driver/arm-alignment.c index b714f80a07dc1..3ddf29b31dbda 100644 --- a/clang/test/Driver/arm-alignment.c +++ b/clang/test/Driver/arm-alignment.c @@ -16,9 +16,6 @@ // RUN: %clang -target armv7-unknown-linux -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s -// RUN: %clang -target armv7-unknown-nacl-gnueabihf -### %s 2> %t -// RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s - // RUN: %clang -target armv7-windows -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s @@ -80,9 +77,6 @@ // RUN: %clang -target armv6-unknown-linux -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-ALIGNED-ARM < %t %s -// RUN: %clang -target armv6-unknown-nacl-gnueabihf -### %s 2> %t -// RUN: FileCheck --check-prefix=CHECK-ALIGNED-ARM < %t %s - // RUN: %clang -target armv6m-apple-darwin -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-ALIGNED-ARM < %t %s diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c index f00940bd7613d..6e21671f43775 100644 --- a/clang/test/Driver/frame-pointer-elim.c +++ b/clang/test/Driver/frame-pointer-elim.c @@ -162,7 +162,7 @@ // RUN: FileCheck --check-prefix=KEEP-ALL %s // RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s -// RUN: not %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \ +// RUN: %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s // On ARM backend bare metal targets, frame pointer is omitted diff --git a/clang/test/Driver/mingw.cpp b/clang/test/Driver/mingw.cpp index 66da0c97f4166..f43fa177e2905 100644 --- a/clang/test/Driver/mingw.cpp +++ b/clang/test/Driver/mingw.cpp @@ -85,6 +85,10 @@ // RUN: | FileCheck %s --check-prefix CHECK_MINGW_EC_LINK // CHECK_MINGW_EC_LINK: "-m" "arm64ecpe" +// RUN: %clang --target=aarch64-windows-gnu -marm64x -### -o /dev/null %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix CHECK_MINGW_A64X_LINK +// CHECK_MINGW_A64X_LINK: "-m" "arm64xpe" + // RUN: %clang --target=mipsel-windows-gnu -### -o /dev/null %s 2>&1 \ // RUN: | FileCheck %s --check-prefix CHECK_MINGW_MIPSPE // CHECK_MINGW_MIPSPE: "-m" "mipspe" diff --git a/clang/test/Driver/nacl-direct.c b/clang/test/Driver/nacl-direct.c deleted file mode 100644 index b1a80b3e9f837..0000000000000 --- a/clang/test/Driver/nacl-direct.c +++ /dev/null @@ -1,146 +0,0 @@ -// Test clang changes for NaCl Support including: -// include paths, library paths, emulation, default static -// -// RUN: %clang -### %s \ -// RUN: --target=i686-unknown-nacl -resource-dir foo 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-I686 %s -// CHECK-I686: "-cc1" -// CHECK-I686-NOT: "-fno-use-init-array" -// CHECK-I686: "-target-cpu" "pentium4" -// CHECK-I686: "-resource-dir" "foo" -// CHECK-I686: "-internal-isystem" "foo{{/|\\\\}}include" -// CHECK-I686: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}i686-nacl{{/|\\\\}}usr{{/|\\\\}}include" -// CHECK-I686: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}include" -// CHECK-I686: as{{(.exe)?}}" 
"--32" -// CHECK-I686: ld{{(.exe)?}}" -// CHECK-I686: "--build-id" -// CHECK-I686: "-m" "elf_i386_nacl" -// CHECK-I686: "-static" -// CHECK-I686: "-L{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}lib32" -// CHECK-I686: "-L{{.*}}{{/|\\\\}}..{{/|\\\\}}i686-nacl{{/|\\\\}}usr{{/|\\\\}}lib" -// CHECK-I686: "-Lfoo{{/|\\\\}}lib{{/|\\\\}}i686-nacl" -// CHECK-I686-NOT: -lpthread -// -// RUN: %clang -### %s \ -// RUN: --target=x86_64-unknown-nacl -resource-dir foo 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-x86_64 %s -// CHECK-x86_64: "-cc1" -// CHECK-x86_64-NOT: "-fno-use-init-array" -// CHECK-x86_64: "-target-cpu" "x86-64" -// CHECK-x86_64: "-resource-dir" "foo" -// CHECK-x86_64: "-internal-isystem" "foo{{/|\\\\}}include" -// CHECK-x86_64: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}usr{{/|\\\\}}include" -// CHECK-x86_64: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}include" -// CHECK-x86_64: as{{(.exe)?}}" "--64" -// CHECK-x86_64: ld{{(.exe)?}}" -// CHECK-x86_64: "--build-id" -// CHECK-x86_64: "-m" "elf_x86_64_nacl" -// CHECK-x86_64: "-static" -// CHECK-x86_64: "-L{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}lib" -// CHECK-x86_64: "-L{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}usr{{/|\\\\}}lib" -// CHECK-x86_64: "-Lfoo{{/|\\\\}}lib{{/|\\\\}}x86_64-nacl" -// CHECK-X86_64-NOT: -lpthread -// -// RUN: %clang -### %s \ -// RUN: --target=armv7a-unknown-nacl-gnueabihf -resource-dir foo 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-ARM %s -// CHECK-ARM: "-cc1" -// CHECK-ARM-NOT: "-fno-use-init-array" -// CHECK-ARM: "-target-cpu" "generic" -// CHECK-ARM: "-target-abi" "aapcs-linux" -// CHECK-ARM: "-mfloat-abi" "hard" -// CHECK-ARM: "-resource-dir" "foo" -// CHECK-ARM: "-internal-isystem" "foo{{/|\\\\}}include" -// CHECK-ARM: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}arm-nacl{{/|\\\\}}usr{{/|\\\\}}include" -// CHECK-ARM: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}arm-nacl{{/|\\\\}}include" -// CHECK-ARM: as{{(.exe)?}}" -// CHECK-ARM: "-mfloat-abi=hard" -// CHECK-ARM: ld{{(.exe)?}}" -// CHECK-ARM: "--build-id" -// CHECK-ARM: "-m" "armelf_nacl" -// CHECK-ARM: "-static" -// CHECK-ARM: "-L{{.*}}{{/|\\\\}}..{{/|\\\\}}arm-nacl{{/|\\\\}}lib" -// CHECK-ARM: "-L{{.*}}{{/|\\\\}}..{{/|\\\\}}arm-nacl{{/|\\\\}}usr{{/|\\\\}}lib" -// CHECK-ARM: "-Lfoo{{/|\\\\}}lib{{/|\\\\}}arm-nacl" -// CHECK-ARM-NOT: -lpthread -// -// RUN: %clang -### %s \ -// RUN: --target=mipsel-unknown-nacl -resource-dir foo 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-MIPS %s -// CHECK-MIPS: "-cc1" -// CHECK-MIPS-NOT: "-fno-use-init-array" -// CHECK-MIPS: "-target-cpu" "mips32r2" -// CHECK-MIPS: "-target-abi" "o32" -// CHECK-MIPS: "-mfloat-abi" "hard" -// CHECK-MIPS: "-resource-dir" "foo" -// CHECK-MIPS: "-internal-isystem" "foo{{/|\\\\}}include" -// CHECK-MIPS: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}mipsel-nacl{{/|\\\\}}usr{{/|\\\\}}include" -// CHECK-MIPS: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}mipsel-nacl{{/|\\\\}}include" -// CHECK-MIPS-NOT: as{{(.exe)?}}" -// CHECK-MIPS: ld{{(.exe)?}}" -// CHECK-MIPS: "--build-id" -// CHECK-MIPS: "-m" "mipselelf_nacl" -// CHECK-MIPS: "-static" -// CHECK-MIPS: "-L{{.*}}{{/|\\\\}}..{{/|\\\\}}mipsel-nacl{{/|\\\\}}lib" -// CHECK-MIPS: "-L{{.*}}{{/|\\\\}}..{{/|\\\\}}mipsel-nacl{{/|\\\\}}usr{{/|\\\\}}lib" -// CHECK-MIPS: "-Lfoo{{/|\\\\}}lib{{/|\\\\}}mipsel-nacl" -// CHECK-MIPS: "-lpnacl_legacy" -// CHECK-MIPS-NOT: "-lpthread" - -// Check that even when the target arch is just "arm" (as will be the case when -// 
it is inferred from the binary name) that we get the right ABI flags -// RUN: %clang -### %s 2>&1 \ -// RUN: --target=arm-nacl \ -// RUN: | FileCheck --check-prefix=CHECK-ARM-NOV7 %s -// CHECK-ARM-NOV7: "-triple" "armv7-unknown-nacl-gnueabihf" -// CHECK-ARM-NOV7: "-target-abi" "aapcs-linux" -// CHECK-ARM-NOV7: "-mfloat-abi" "hard" -// CHECK-ARM-NOV7: as{{(.exe)?}}" -// CHECK-ARM-NOV7: "-mfloat-abi=hard" - -// Test clang c++ include dirs and link line when using clang++ - -// RUN: %clangxx -### %s \ -// RUN: --target=armv7a-unknown-nacl-gnueabihf -resource-dir foo 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-ARM-CXX %s -// CHECK-ARM-CXX: "-cc1" -// CHECK-ARM-CXX: "-resource-dir" "foo" -// CHECK-ARM-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}arm-nacl{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}v1" -// CHECK-ARM-CXX: "-internal-isystem" "foo{{/|\\\\}}include" -// CHECK-ARM-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}arm-nacl{{/|\\\\}}usr{{/|\\\\}}include" -// CHECK-ARM-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}arm-nacl{{/|\\\\}}include" -// CHECK-ARM-CXX: "-lpthread" - -// RUN: %clangxx -### %s \ -// RUN: --target=i686-unknown-nacl -resource-dir foo 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-I686-CXX %s -// CHECK-I686-CXX: "-cc1" -// CHECK-I686-CXX: "-resource-dir" "foo" -// CHECK-I686-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}v1" -// CHECK-I686-CXX: "-internal-isystem" "foo{{/|\\\\}}include" -// CHECK-I686-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}i686-nacl{{/|\\\\}}usr{{/|\\\\}}include" -// CHECK-I686-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}include" -// CHECK-I686-CXX: "-lpthread" - -// RUN: %clangxx -### %s \ -// RUN: --target=x86_64-unknown-nacl -resource-dir foo 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-x86_64-CXX %s -// CHECK-x86_64-CXX: "-cc1" -// CHECK-x86_64-CXX: "-resource-dir" "foo" -// CHECK-x86_64-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}v1" -// CHECK-x86_64-CXX: "-internal-isystem" "foo{{/|\\\\}}include" -// CHECK-x86_64-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}usr{{/|\\\\}}include" -// CHECK-x86_64-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}x86_64-nacl{{/|\\\\}}include" -// CHECK-x86_64-CXX: "-lpthread" - -// RUN: %clangxx -### %s \ -// RUN: --target=mipsel-unknown-nacl -resource-dir foo 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-MIPS-CXX %s -// CHECK-MIPS-CXX: "-cc1" -// CHECK-MIPS-CXX: "-resource-dir" "foo" -// CHECK-MIPS-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}mipsel-nacl{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}v1" -// CHECK-MIPS-CXX: "-internal-isystem" "foo{{/|\\\\}}include" -// CHECK-MIPS-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}mipsel-nacl{{/|\\\\}}usr{{/|\\\\}}include" -// CHECK-MIPS-CXX: "-internal-isystem" "{{.*}}{{/|\\\\}}..{{/|\\\\}}mipsel-nacl{{/|\\\\}}include" -// CHECK-MIPS-CXX: "-lnacl" -// CHECK-MIPS-CXX: "-lpthread" diff --git a/clang/test/Driver/openacc.c b/clang/test/Driver/openacc.c index c7f1d2545bd03..f46e2a32bcab2 100644 --- a/clang/test/Driver/openacc.c +++ b/clang/test/Driver/openacc.c @@ -1,14 +1,2 @@ // RUN: %clang -S -### -fopenacc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DRIVER // CHECK-DRIVER: "-cc1" {{.*}} "-fopenacc" - -// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override=202211 %s 2>&1 | FileCheck %s 
--check-prefix=CHECK-MACRO-OVERRIDE -// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override 202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE -// CHECK-MACRO-OVERRIDE: "-cc1"{{.*}} "-fexperimental-openacc-macro-override" "202211" - -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// INVALID: error: the clang compiler does not support diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index b87f0cb568e00..2503f2473d64a 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -158,6 +158,7 @@ // CHECK-NEXT: svnapot 1.0 'Svnapot' (NAPOT Translation Contiguity) // CHECK-NEXT: svpbmt 1.0 'Svpbmt' (Page-Based Memory Types) // CHECK-NEXT: svvptc 1.0 'Svvptc' (Obviating Memory-Management Instructions after Marking PTEs Valid) +// CHECK-NEXT: xandesbfhcvt 5.0 'XAndesBFHCvt' (Andes Scalar BFLOAT16 Conversion Extension) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) // CHECK-NEXT: xandesvbfhcvt 5.0 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension) // CHECK-NEXT: xandesvdot 5.0 'XAndesVDot' (Andes Vector Dot Product Extension) diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c index d32d1c1a8183f..2698612f815d3 100644 --- a/clang/test/Driver/riscv-cpus.c +++ b/clang/test/Driver/riscv-cpus.c @@ -287,7 +287,7 @@ // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mtune=xiangshan-nanhu | FileCheck -check-prefix=MTUNE-XIANGSHAN-NANHU %s // MTUNE-XIANGSHAN-NANHU: "-tune-cpu" "xiangshan-nanhu" -// Check mtune alias CPU has resolved to the right CPU according XLEN. +// Check that an -mtune CPU alias resolves to the right CPU according to XLEN.
// RUN: %clang --target=riscv32 -### -c %s 2>&1 -mtune=generic | FileCheck -check-prefix=MTUNE-GENERIC-32 %s // MTUNE-GENERIC-32: "-tune-cpu" "generic" @@ -304,21 +304,21 @@ // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mtune=native | FileCheck -check-prefix=MTUNE-NATIVE %s // MTUNE-NATIVE-NOT: "-tune-cpu" "native" -// mcpu with default march +// -mcpu with default -march // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-e20 | FileCheck -check-prefix=MCPU-SIFIVE-E20 %s // MCPU-SIFIVE-E20: "-nostdsysteminc" "-target-cpu" "sifive-e20" // MCPU-SIFIVE-E20: "-target-feature" "+m" "-target-feature" "+c" // MCPU-SIFIVE-E20: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-SIFIVE-E20: "-target-abi" "ilp32" -// mcpu with default march +// -mcpu with default -march // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-e21 | FileCheck -check-prefix=MCPU-SIFIVE-E21 %s // MCPU-SIFIVE-E21: "-nostdsysteminc" "-target-cpu" "sifive-e21" // MCPU-SIFIVE-E21: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+c" // MCPU-SIFIVE-E21: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-SIFIVE-E21: "-target-abi" "ilp32" -// mcpu with default march +// -mcpu with default -march // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-e24 | FileCheck -check-prefix=MCPU-SIFIVE-E24 %s // MCPU-SIFIVE-E24: "-nostdsysteminc" "-target-cpu" "sifive-e24" // MCPU-SIFIVE-E24: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" @@ -326,7 +326,7 @@ // MCPU-SIFIVE-E24: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-SIFIVE-E24: "-target-abi" "ilp32f" -// mcpu with default march +// -mcpu with default -march // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-e34 | FileCheck -check-prefix=MCPU-SIFIVE-E34 %s // MCPU-SIFIVE-E34: "-nostdsysteminc" "-target-cpu" "sifive-e34" // MCPU-SIFIVE-E34: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" @@ -334,7 +334,7 @@ // MCPU-SIFIVE-E34: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-SIFIVE-E34: "-target-abi" "ilp32f" -// mcpu with mabi option +// -mcpu with -mabi option // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-s21 -mabi=lp64 | FileCheck -check-prefix=MCPU-ABI-SIFIVE-S21 %s // MCPU-ABI-SIFIVE-S21: "-nostdsysteminc" "-target-cpu" "sifive-s21" // MCPU-ABI-SIFIVE-S21: "-target-feature" "+m" "-target-feature" "+a" @@ -342,7 +342,7 @@ // MCPU-ABI-SIFIVE-S21: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-ABI-SIFIVE-S21: "-target-abi" "lp64" -// mcpu with mabi option +// -mcpu with -mabi option // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-s51 -mabi=lp64 | FileCheck -check-prefix=MCPU-ABI-SIFIVE-S51 %s // MCPU-ABI-SIFIVE-S51: "-nostdsysteminc" "-target-cpu" "sifive-s51" // MCPU-ABI-SIFIVE-S51: "-target-feature" "+m" "-target-feature" "+a" @@ -350,7 +350,7 @@ // MCPU-ABI-SIFIVE-S51: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-ABI-SIFIVE-S51: "-target-abi" "lp64" -// mcpu with default march +// -mcpu with default -march // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-s54 | FileCheck -check-prefix=MCPU-SIFIVE-S54 %s // MCPU-SIFIVE-S54: "-nostdsysteminc" "-target-cpu" "sifive-s54" // MCPU-SIFIVE-S54: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" @@ -358,7 +358,7 @@ // MCPU-SIFIVE-S54: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-SIFIVE-S54: "-target-abi" "lp64d" -// mcpu with mabi option +// -mcpu 
with mabi option +// -mcpu with -mabi option // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-s76 | FileCheck -check-prefix=MCPU-SIFIVE-S76 %s // MCPU-SIFIVE-S76: "-nostdsysteminc" "-target-cpu" "sifive-s76" // MCPU-SIFIVE-S76: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" @@ -366,7 +366,7 @@ // MCPU-SIFIVE-S76: "-target-feature" "+zicsr" "-target-feature" "+zifencei" "-target-feature" "+zihintpause" // MCPU-SIFIVE-S76: "-target-abi" "lp64d" -// mcpu with default march +// -mcpu with default -march // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-u54 | FileCheck -check-prefix=MCPU-SIFIVE-U54 %s // MCPU-SIFIVE-U54: "-nostdsysteminc" "-target-cpu" "sifive-u54" // MCPU-SIFIVE-U54: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" @@ -374,7 +374,7 @@ // MCPU-SIFIVE-U54: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-SIFIVE-U54: "-target-abi" "lp64d" -// mcpu with mabi option +// -mcpu with -mabi option // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-u54 -mabi=lp64 | FileCheck -check-prefix=MCPU-ABI-SIFIVE-U54 %s // MCPU-ABI-SIFIVE-U54: "-nostdsysteminc" "-target-cpu" "sifive-u54" // MCPU-ABI-SIFIVE-U54: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" @@ -382,7 +382,7 @@ // MCPU-ABI-SIFIVE-U54: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-ABI-SIFIVE-U54: "-target-abi" "lp64" -// mcpu with default march +// -mcpu with default -march // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-e76 | FileCheck -check-prefix=MCPU-SIFIVE-E76 %s // MCPU-SIFIVE-E76: "-nostdsysteminc" "-target-cpu" "sifive-e76" // MCPU-SIFIVE-E76: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" @@ -390,7 +390,7 @@ // MCPU-SIFIVE-E76: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-SIFIVE-E76: "-target-abi" "ilp32f" -// mcpu with mabi option +// -mcpu with -mabi option // RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=sifive-u74 -mabi=lp64 | FileCheck -check-prefix=MCPU-ABI-SIFIVE-U74 %s // MCPU-ABI-SIFIVE-U74: "-nostdsysteminc" "-target-cpu" "sifive-u74" // MCPU-ABI-SIFIVE-U74: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" @@ -398,13 +398,13 @@ // MCPU-ABI-SIFIVE-U74: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MCPU-ABI-SIFIVE-U74: "-target-abi" "lp64" -// march overwrite mcpu's default march +// -march overwrites -mcpu's default -march // RUN: %clang --target=riscv32 -### -c %s 2>&1 -mcpu=sifive-e31 -march=rv32imc | FileCheck -check-prefix=MCPU-MARCH %s // MCPU-MARCH: "-nostdsysteminc" "-target-cpu" "sifive-e31" "-target-feature" "+m" "-target-feature" "+c" // MCPU-MARCH: "-target-abi" "ilp32" -// Check interaction between mcpu and mtune, mtune won't affect arch related -// target feature, but mcpu will. +// Check interaction between -mcpu and -mtune; -mtune won't affect arch-related +// target features, but -mcpu will. // // In this case, sifive-e31 is rv32imac, sifive-e76 is rv32imafc, so F-extension // should not be enabled.
@@ -418,7 +418,7 @@ // MTUNE-E31-MCPU-E76-SAME: "-target-feature" "+zicsr" "-target-feature" "+zifencei" // MTUNE-E31-MCPU-E76-SAME: "-tune-cpu" "sifive-e76" -// mcpu with default march include experimental extensions +// -mcpu with default -march including experimental extensions // RUN: %clang -target riscv64 -### -c %s 2>&1 -menable-experimental-extensions -mcpu=sifive-x280 | FileCheck -check-prefix=MCPU-SIFIVE-X280 %s // MCPU-SIFIVE-X280: "-nostdsysteminc" "-target-cpu" "sifive-x280" // MCPU-SIFIVE-X280-SAME: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" diff --git a/clang/test/Driver/unsupported-target-arch.c b/clang/test/Driver/unsupported-target-arch.c index c0ec235439bca..426f646db50b0 100644 --- a/clang/test/Driver/unsupported-target-arch.c +++ b/clang/test/Driver/unsupported-target-arch.c @@ -20,10 +20,6 @@ // RUN: FileCheck --input-file=%t.err --check-prefix=CHECK-NOARCH-NETBSD %s // CHECK-NOARCH-NETBSD: error: unknown target triple 'noarch-unknown-netbsd'{{$}} // -// RUN: not %clang --target=noarch-unknown-nacl -o %t.o %s 2> %t.err -// RUN: FileCheck --input-file=%t.err --check-prefix=CHECK-NOARCH-NACL %s -// CHECK-NOARCH-NACL: error: the target architecture 'noarch' is not supported by the target 'Native Client' - // RUN: not %clang --target=noarch-unknown-windows-gnu -o %t.o %s 2> %t.err // RUN: FileCheck --input-file=%t.err --check-prefix=CHECK-NOARCH-MINGW %s // CHECK-NOARCH-MINGW: error: unknown target triple 'noarch-unknown-windows-gnu' diff --git a/clang/test/Driver/x86_64-nacl-defines.cpp b/clang/test/Driver/x86_64-nacl-defines.cpp deleted file mode 100644 index 1b29fc7922419..0000000000000 --- a/clang/test/Driver/x86_64-nacl-defines.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// RUN: %clang --target=x86_64-unknown-nacl -### %s -c -o %t.o 2>&1 | FileCheck %s -check-prefix=ECHO -// RUN: %clang -target x86_64-unknown-nacl %s -emit-llvm -S -c -o - | FileCheck %s -// RUN: %clang -target x86_64-unknown-nacl %s -emit-llvm -S -c -pthread -o - | FileCheck %s -check-prefix=THREADS - -// ECHO: {{.*}} "-cc1" {{.*}}x86_64-nacl-defines.c - -// Check platform defines - -// CHECK: __LITTLE_ENDIAN__defined -#ifdef __LITTLE_ENDIAN__ -void __LITTLE_ENDIAN__defined() {} -#endif - -// CHECK: __native_client__defined -#ifdef __native_client__ -void __native_client__defined() {} -#endif - -// CHECK: __x86_64__defined -#ifdef __x86_64__ -void __x86_64__defined() {} -#endif - -// CHECK: unixdefined -#ifdef unix -void unixdefined() {} -#endif - -// CHECK: __ELF__defined -#ifdef __ELF__ -void __ELF__defined() {} -#endif - -// CHECK: _GNU_SOURCEdefined -#ifdef _GNU_SOURCE -void _GNU_SOURCEdefined() {} -#endif - -// THREADS: _REENTRANTdefined -// CHECK: _REENTRANTundefined -#ifdef _REENTRANT -void _REENTRANTdefined() {} -#else -void _REENTRANTundefined() {} -#endif diff --git a/clang/test/Format/multiple-inputs-error.cpp b/clang/test/Format/multiple-inputs-error.cpp index 7cb835d39f23e..1aa9c9f3e2fad 100644 --- a/clang/test/Format/multiple-inputs-error.cpp +++ b/clang/test/Format/multiple-inputs-error.cpp @@ -1,6 +1,6 @@ // RUN: cp %s %t-1.cpp // RUN: cp %s %t-2.cpp -// RUN: not clang-format 2>&1 >/dev/null -offset=1 -length=1 %t-1.cpp %t-2.cpp |FileCheck %s +// RUN: not clang-format 2>&1 >/dev/null -offset=1 -length=0 %t-1.cpp %t-2.cpp |FileCheck %s // RUN: not clang-format 2>&1 >/dev/null -lines=1:1 %t-1.cpp %t-2.cpp |FileCheck %s -check-prefix=CHECK-LINE // CHECK: error: -offset, -length and -lines can only be used for single file.
// CHECK-LINE: error: -offset, -length and -lines can only be used for single file. diff --git a/clang/test/Format/ranges.cpp b/clang/test/Format/ranges.cpp index f42492e43f84b..66b984e037b3c 100644 --- a/clang/test/Format/ranges.cpp +++ b/clang/test/Format/ranges.cpp @@ -1,5 +1,5 @@ // RUN: grep -Ev "// *[A-Z-]+:" %s \ -// RUN: | clang-format -style=LLVM -offset=2 -length=1 -offset=28 -length=1 -offset=35 -length=8 \ +// RUN: | clang-format -style=LLVM -offset=2 -length=0 -offset=28 -length=0 \ // RUN: | FileCheck -strict-whitespace %s // CHECK: {{^int\ \*i;$}} int*i; @@ -9,12 +9,3 @@ int * i; // CHECK: {{^int\ \*i;$}} int * i; - -// CHECK: int I; -// CHECK-NEXT: int J ; -int I ; -int J ; - -// RUN: not clang-format -length=0 %s 2>&1 \ -// RUN: | FileCheck -strict-whitespace -check-prefix=CHECK0 %s -// CHECK0: error: length should be at least 1 diff --git a/clang/test/Frontend/x86_64-nacl-types.cpp b/clang/test/Frontend/x86_64-nacl-types.cpp deleted file mode 100644 index ca200147f4013..0000000000000 --- a/clang/test/Frontend/x86_64-nacl-types.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-nacl -std=c++11 -verify %s -// expected-no-diagnostics - -#include <stdarg.h> -#include <stddef.h> - -static_assert(alignof(char) == 1, "alignof char is wrong"); - -static_assert(sizeof(short) == 2, "sizeof short is wrong"); -static_assert(alignof(short) == 2, "alignof short is wrong"); - -static_assert(sizeof(int) == 4, "sizeof int is wrong"); -static_assert(alignof(int) == 4, "alignof int is wrong"); - -static_assert(sizeof(long) == 4, "sizeof long is wrong"); -static_assert(alignof(long) == 4, "alignof long is wrong"); - -static_assert(sizeof(long long) == 8, "sizeof long long is wrong wrong"); -static_assert(alignof(long long) == 8, "alignof long long is wrong wrong"); - -static_assert(sizeof(void*) == 4, "sizeof void * is wrong"); -static_assert(alignof(void*) == 4, "alignof void * is wrong"); - -static_assert(sizeof(float) == 4, "sizeof float is wrong"); -static_assert(alignof(float) == 4, "alignof float is wrong"); - -static_assert(sizeof(double) == 8, "sizeof double is wrong"); -static_assert(alignof(double) == 8, "alignof double is wrong"); - -static_assert(sizeof(long double) == 8, "sizeof long double is wrong"); -static_assert(alignof(long double) == 8, "alignof long double is wrong"); - -static_assert(sizeof(va_list) == 16, "sizeof va_list is wrong"); -static_assert(alignof(va_list) == 4, "alignof va_list is wrong"); - -static_assert(sizeof(size_t) == 4, "sizeof size_t is wrong"); -static_assert(alignof(size_t) == 4, "alignof size_t is wrong"); diff --git a/clang/test/Headers/spirv_ids.cpp b/clang/test/Headers/spirv_ids.cpp index 0cd74dbca53aa..466be5deee87a 100644 --- a/clang/test/Headers/spirv_ids.cpp +++ b/clang/test/Headers/spirv_ids.cpp @@ -53,58 +53,58 @@ // CHECK: call i32 @llvm.spv.subgroup.id() // CHECK: call i32 @llvm.spv.subgroup.local.invocation.id() -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 0) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 1) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 2) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 2) #2
-// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 noundef 0) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 noundef 1) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 noundef 2) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 0) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 1) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 2) #2 -// NV: call noundef i32 @_Z20__spirv_SubgroupSizev() #2 -// NV: call noundef i32 @_Z23__spirv_SubgroupMaxSizev() #2 -// NV: call noundef i32 @_Z20__spirv_NumSubgroupsv() #2 -// NV: call noundef i32 @_Z18__spirv_SubgroupIdv() #2 -// NV: call noundef i32 @_Z33__spirv_SubgroupLocalInvocationIdv() #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 0) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 1) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 2) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 0) #2 +// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 1) #2 +// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 2) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 0) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 1) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 2) #2 +// NV: call noundef i32 @_Z27__spirv_BuiltInSubgroupSizev() #2 +// NV: call noundef i32 @_Z30__spirv_BuiltInSubgroupMaxSizev() #2 +// NV: call noundef i32 @_Z27__spirv_BuiltInNumSubgroupsv() #2 +// NV: call noundef i32 @_Z25__spirv_BuiltInSubgroupIdv() #2 +// NV: call noundef i32 @_Z40__spirv_BuiltInSubgroupLocalInvocationIdv() #2 void test_id_and_range() { - __spirv_NumWorkgroups(0); - __spirv_NumWorkgroups(1); - __spirv_NumWorkgroups(2); - __spirv_WorkgroupSize(0); - __spirv_WorkgroupSize(1); - __spirv_WorkgroupSize(2); - __spirv_WorkgroupId(0); - __spirv_WorkgroupId(1); - __spirv_WorkgroupId(2); - __spirv_LocalInvocationId(0); - __spirv_LocalInvocationId(1); - __spirv_LocalInvocationId(2); - __spirv_GlobalInvocationId(0); - 
__spirv_GlobalInvocationId(1); - __spirv_GlobalInvocationId(2); - __spirv_GlobalSize(0); - __spirv_GlobalSize(1); - __spirv_GlobalSize(2); - __spirv_GlobalOffset(0); - __spirv_GlobalOffset(1); - __spirv_GlobalOffset(2); - unsigned int ssize = __spirv_SubgroupSize(); - unsigned int smax = __spirv_SubgroupMaxSize(); - unsigned int snum = __spirv_NumSubgroups(); - unsigned int sid = __spirv_SubgroupId(); - unsigned int sinvocid = __spirv_SubgroupLocalInvocationId(); + __spirv_BuiltInNumWorkgroups(0); + __spirv_BuiltInNumWorkgroups(1); + __spirv_BuiltInNumWorkgroups(2); + __spirv_BuiltInWorkgroupSize(0); + __spirv_BuiltInWorkgroupSize(1); + __spirv_BuiltInWorkgroupSize(2); + __spirv_BuiltInWorkgroupId(0); + __spirv_BuiltInWorkgroupId(1); + __spirv_BuiltInWorkgroupId(2); + __spirv_BuiltInLocalInvocationId(0); + __spirv_BuiltInLocalInvocationId(1); + __spirv_BuiltInLocalInvocationId(2); + __spirv_BuiltInGlobalInvocationId(0); + __spirv_BuiltInGlobalInvocationId(1); + __spirv_BuiltInGlobalInvocationId(2); + __spirv_BuiltInGlobalSize(0); + __spirv_BuiltInGlobalSize(1); + __spirv_BuiltInGlobalSize(2); + __spirv_BuiltInGlobalOffset(0); + __spirv_BuiltInGlobalOffset(1); + __spirv_BuiltInGlobalOffset(2); + unsigned int ssize = __spirv_BuiltInSubgroupSize(); + unsigned int smax = __spirv_BuiltInSubgroupMaxSize(); + unsigned int snum = __spirv_BuiltInNumSubgroups(); + unsigned int sid = __spirv_BuiltInSubgroupId(); + unsigned int sinvocid = __spirv_BuiltInSubgroupLocalInvocationId(); } diff --git a/clang/test/Misc/warning-wall.c b/clang/test/Misc/warning-wall.c index 91de843f88c91..689868c62f6a7 100644 --- a/clang/test/Misc/warning-wall.c +++ b/clang/test/Misc/warning-wall.c @@ -66,6 +66,7 @@ CHECK-NEXT: -Wuninitialized CHECK-NEXT: -Wsometimes-uninitialized CHECK-NEXT: -Wstatic-self-init CHECK-NEXT: -Wuninitialized-const-reference +CHECK-NEXT: -Wuninitialized-const-pointer CHECK-NEXT: -Wunknown-pragmas CHECK-NEXT: -Wunused CHECK-NEXT: -Wunused-argument diff --git a/clang/test/Modules/cxx20-10-2-ex1.cpp b/clang/test/Modules/cxx20-10-2-ex1.cpp index 0cd6f77466f4b..749b15213098a 100644 --- a/clang/test/Modules/cxx20-10-2-ex1.cpp +++ b/clang/test/Modules/cxx20-10-2-ex1.cpp @@ -14,7 +14,7 @@ export int x; module; #include "std-10-2-ex1.h" -// expected-error@std-10-2-ex1.h:* {{export declaration can only be used within a module purview}} +// expected-error@std-10-2-ex1.h:* {{export declaration can only be used within a module interface}} export module M1; export namespace {} // expected-error {{anonymous namespaces cannot be exported}} diff --git a/clang/test/Modules/cxx20-export-import.cpp b/clang/test/Modules/cxx20-export-import.cpp index 0b505668e8589..c14883e575575 100644 --- a/clang/test/Modules/cxx20-export-import.cpp +++ b/clang/test/Modules/cxx20-export-import.cpp @@ -11,4 +11,4 @@ export module dummy; //--- test.cpp -export import dummy; // expected-error {{export declaration can only be used within a module purview}} +export import dummy; // expected-error {{export declaration can only be used within a module interface}} diff --git a/clang/test/Modules/cxx20-import-diagnostics-a.cpp b/clang/test/Modules/cxx20-import-diagnostics-a.cpp index 1b38259e0358c..72a31ea1d7d78 100644 --- a/clang/test/Modules/cxx20-import-diagnostics-a.cpp +++ b/clang/test/Modules/cxx20-import-diagnostics-a.cpp @@ -110,7 +110,7 @@ module; module AOK1; -export import C; // expected-error {{export declaration can only be used within a module purview}} +export import C; // expected-error {{export declaration can only be used 
within a module interface}} int theAnswer () { return 42; } diff --git a/clang/test/Modules/export-in-non-modules.cpp b/clang/test/Modules/export-in-non-modules.cpp index 69360eb46d774..7b2575c60f1fd 100644 --- a/clang/test/Modules/export-in-non-modules.cpp +++ b/clang/test/Modules/export-in-non-modules.cpp @@ -1,4 +1,4 @@ // RUN: %clang_cc1 -std=c++20 %s -fsyntax-only -verify -export struct Unit { // expected-error {{export declaration can only be used within a module purview}} +export struct Unit { // expected-error {{export declaration can only be used within a module interface}} bool operator<(const Unit &); }; diff --git a/clang/test/PCH/no-validate-pch.cl b/clang/test/PCH/no-validate-pch.cl index f6a7c65461fe1..ce3a9095fe2e6 100644 --- a/clang/test/PCH/no-validate-pch.cl +++ b/clang/test/PCH/no-validate-pch.cl @@ -16,7 +16,7 @@ // CHECK: note: previous definition is here // CHECK: #define X 4 -// CHECK-VAL: error: __OPTIMIZE__ predefined macro was enabled in precompiled file '{{.*}}' but is currently disabled +// CHECK-VAL: error: OptimizationLevel differs in precompiled file '{{.*}}' vs. current file // CHECK-VAL: error: definition of macro 'X' differs between the precompiled file '{{.*}}' ('4') and the command line ('5') void test(void) { diff --git a/clang/test/Parser/gh114815.cpp b/clang/test/Parser/gh114815.cpp new file mode 100644 index 0000000000000..6a89384e9e66d --- /dev/null +++ b/clang/test/Parser/gh114815.cpp @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -verify %s -std=c++11 -fsyntax-only + +#define ID(X) X +extern int ID(decltype); +// expected-error@-1 {{expected '(' after 'decltype'}} \ +// expected-error@-1 {{expected unqualified-id}} diff --git a/clang/test/Preprocessor/openacc.c b/clang/test/Preprocessor/openacc.c index be7052f00e0ce..283baa6c2fe4b 100644 --- a/clang/test/Preprocessor/openacc.c +++ b/clang/test/Preprocessor/openacc.c @@ -1,13 +1,9 @@ // RUN: %clang_cc1 -E -fopenacc %s | FileCheck %s --check-prefix=DEFAULT -// RUN: %clang_cc1 -E -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=OVERRIDE -// DEFAULT: OpenACC:1: -// OVERRIDE: OpenACC:202211: +// DEFAULT: OpenACC:202506: OpenACC:_OPENACC: // RUN: %clang_cc1 -E -dM -fopenacc %s | FileCheck %s --check-prefix=MACRO_PRINT_DEF -// RUN: %clang_cc1 -E -dM -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=MACRO_PRINT_OVR -// MACRO_PRINT_DEF: #define _OPENACC 1 -// MACRO_PRINT_OVR: #define _OPENACC 202211 +// MACRO_PRINT_DEF: #define _OPENACC 202506 diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c index fe27ed8814eec..9a61ac33ee420 100644 --- a/clang/test/Preprocessor/predefined-macros-no-warnings.c +++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c @@ -34,7 +34,6 @@ // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple arm-netbsd // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple arm-openbsd // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple arm-rtems -// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple arm-nacl // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple arm-win32-cygnus // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple arm-win32-gnu // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple arm-win32-itanium @@ -60,7 +59,6 @@ // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple mipsel-rtems // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple mipsel-freebsd // RUN: 
%clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple mipsel-netbsd -// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple mipsel-nacl // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple mips64 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple mips64-linux // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple mips64-rtems @@ -147,7 +145,6 @@ // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple i686-win32-msvc // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple i686-haiku // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple i686-rtems -// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple i686-nacl // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple i686-elfiamcu // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple i686-hurd // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64 @@ -166,7 +163,6 @@ // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-win32gnu // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-win32msvc // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-haiku -// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-nacl // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps4 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps5 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-managarm diff --git a/clang/test/Preprocessor/riscv-target-features-andes.c b/clang/test/Preprocessor/riscv-target-features-andes.c index 083deb02a2679..f7981bb52de6d 100644 --- a/clang/test/Preprocessor/riscv-target-features-andes.c +++ b/clang/test/Preprocessor/riscv-target-features-andes.c @@ -4,6 +4,7 @@ // RUN: -o - | FileCheck %s // CHECK-NOT: __riscv_xandesperf {{.*$}} +// CHECK-NOT: __riscv_xandesbfhcvt {{.*$}} // CHECK-NOT: __riscv_xandesvbfhcvt {{.*$}} // CHECK-NOT: __riscv_xandesvsintload {{.*$}} // CHECK-NOT: __riscv_xandesvpackfph {{.*$}} @@ -17,6 +18,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-XANDESPERF %s // CHECK-XANDESPERF: __riscv_xandesperf 5000000{{$}} +// RUN: %clang --target=riscv32 \ +// RUN: -march=rv32i_xandesbfhcvt -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XANDESBFHCVT %s +// RUN: %clang --target=riscv64 \ +// RUN: -march=rv64i_xandesbfhcvt -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XANDESBFHCVT %s +// CHECK-XANDESBFHCVT: __riscv_xandesbfhcvt 5000000{{$}} + // RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_xandesvbfhcvt -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-XANDESVBFHCVT %s diff --git a/clang/test/Sema/attr-nonblocking-sema.cpp b/clang/test/Sema/attr-nonblocking-sema.cpp index f13cc783dfc33..c8fb40693eec0 100644 --- a/clang/test/Sema/attr-nonblocking-sema.cpp +++ b/clang/test/Sema/attr-nonblocking-sema.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -verify -Wfunction-effects %s -// RUN: %clang_cc1 -fsyntax-only -fblocks -verify -x c -std=c23 -Wfunction-effects %s +// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -verify -Wfunction-effects -Wfunction-effect-redeclarations %s +// RUN: %clang_cc1 -fsyntax-only -fblocks -verify -x c -std=c23 -Wfunction-effects -Wfunction-effect-redeclarations %s #if !__has_attribute(nonblocking) #error "the 'nonblocking' attribute is not available" @@ -127,29 +127,35 @@ void type_conversions_2() #endif // --- VIRTUAL METHODS --- -// Attributes propagate to overridden methods, so no diagnostics except for conflicts. 
+// Attributes propagate to overridden methods.
// Check this in the syntax tests too.
#ifdef __cplusplus
struct Base {
virtual void f1();
- virtual void nonblocking() noexcept [[clang::nonblocking]];
- virtual void nonallocating() noexcept [[clang::nonallocating]];
+ virtual void nonblocking() noexcept [[clang::nonblocking]]; // expected-note {{overridden virtual function is here}}
+ virtual void nonallocating() noexcept [[clang::nonallocating]]; // expected-note {{overridden virtual function is here}}
virtual void f2() [[clang::nonallocating]]; // expected-note {{previous declaration is here}}
+ virtual void f3() [[clang::nonblocking]]; // expected-note {{overridden virtual function is here}}
};
struct Derived : public Base {
void f1() [[clang::nonblocking]] override;
- void nonblocking() noexcept override;
- void nonallocating() noexcept override;
+ void nonblocking() noexcept override; // expected-warning {{overriding function is missing 'nonblocking' attribute from base declaration}}
+ void nonallocating() noexcept override; // expected-warning {{overriding function is missing 'nonallocating' attribute from base declaration}}
void f2() [[clang::allocating]] override; // expected-warning {{effects conflict when merging declarations; kept 'allocating', discarded 'nonallocating'}}
};
+
+template <bool B>
+struct TDerived : public Base {
+ void f3() [[clang::nonblocking(B)]] override; // expected-warning {{attribute 'nonblocking' on overriding function conflicts with base declaration}}
+};
#endif // __cplusplus
// --- REDECLARATIONS ---
void f2();
void f2() [[clang::nonblocking]]; // expected-note {{previous declaration is here}}
-void f2(); // expected-warning {{attribute 'nonblocking' on function does not match previous declaration}}
+void f2(); // expected-warning {{redeclaration is missing 'nonblocking' attribute from previous declaration}}
// Note: we verify that the attribute is actually seen during the constraints tests.
void f3() [[clang::blocking]]; // expected-note {{previous declaration is here}}
diff --git a/clang/test/Sema/const-eval.c b/clang/test/Sema/const-eval.c
index e358aceaad5a4..87c21120e7c5d 100644
--- a/clang/test/Sema/const-eval.c
+++ b/clang/test/Sema/const-eval.c
@@ -32,7 +32,7 @@ void f(void)
_Complex float g16 = (1.0f + 1.0fi);
// ?: in constant expressions.
-int g17[(3?:1) - 2];
+int g17[(3?:1) - 2];
EVAL_EXPR(18, ((int)((void*)10 + 10)) == 20 ? 1 : -1);
@@ -41,6 +41,9 @@ struct s {
};
EVAL_EXPR(19, ((int)&*(char*)10 == 10 ?
1 : -1)); +// expected-error@-1 {{not an integer constant expression}} \ +// expected-note@-1 {{dereferencing a null pointer is not allowed in a constant expression}} + EVAL_EXPR(20, __builtin_constant_p(*((int*) 10))); diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp index 38dfdb98f08fc..0e98904ade86a 100644 --- a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp +++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -mllvm -debug-only=LifetimeFacts,LifetimeDataflow -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -mllvm -debug-only=LifetimeFacts,LifetimeLoanPropagation -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s // REQUIRES: asserts struct MyObj { @@ -19,7 +19,7 @@ MyObj* return_local_addr() { // CHECK: ReturnOfOrigin (OriginID: [[O_RET_VAL]]) // CHECK: Expire (LoanID: [[L_X]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_X]] contains Loan [[L_X]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_X]] // CHECK-DAG: Origin [[O_RET_VAL]] contains Loan [[L_X]] @@ -47,7 +47,7 @@ MyObj* assign_and_return_local_addr() { // CHECK: ReturnOfOrigin (OriginID: [[O_PTR2_RVAL_2]]) // CHECK: Expire (LoanID: [[L_Y]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_Y]] contains Loan [[L_Y]] // CHECK-DAG: Origin [[O_PTR1]] contains Loan [[L_Y]] // CHECK-DAG: Origin [[O_PTR2]] contains Loan [[L_Y]] @@ -65,7 +65,7 @@ int return_int_val() { return x; } // CHECK-NEXT: End of Block -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK: @@ -79,7 +79,7 @@ void loan_expires_cpp() { // CHECK: AssignOrigin (DestID: [[O_POBJ:[0-9]+]], SrcID: [[O_ADDR_OBJ]]) // CHECK: Expire (LoanID: [[L_OBJ]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_OBJ]] contains Loan [[L_OBJ]] // CHECK-DAG: Origin [[O_POBJ]] contains Loan [[L_OBJ]] @@ -96,7 +96,7 @@ void loan_expires_trivial() { // CHECK-NEXT: End of Block // FIXME: Add check for Expire once trivial destructors are handled for expiration. } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_TRIVIAL_OBJ]] contains Loan [[L_TRIVIAL_OBJ]] // CHECK-DAG: Origin [[O_PTOBJ]] contains Loan [[L_TRIVIAL_OBJ]] @@ -119,7 +119,7 @@ void conditional(bool condition) { // CHECK: AssignOrigin (DestID: [[O_P_RVAL:[0-9]+]], SrcID: [[O_P]]) // CHECK: AssignOrigin (DestID: [[O_Q:[0-9]+]], SrcID: [[O_P_RVAL]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_A]] contains Loan [[L_A]] // CHECK-DAG: Origin [[O_ADDR_B]] contains Loan [[L_B]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_A]] @@ -163,7 +163,7 @@ void pointers_in_a_cycle(bool condition) { } // At the end of the analysis, the origins for the pointers involved in the cycle // (p1, p2, p3, temp) should all contain the loans from v1, v2, and v3 at the fixed point. 
-// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V1]] // CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V2]] // CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V3]] @@ -195,7 +195,7 @@ void overwrite_origin() { // CHECK: Expire (LoanID: [[L_S2]]) // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK: Origin [[O_P]] contains Loan [[L_S2]] // CHECK-NOT: Origin [[O_P]] contains Loan [[L_S1]] @@ -213,7 +213,7 @@ void reassign_to_null() { } // FIXME: Have a better representation for nullptr than just an empty origin. // It should be a separate loan and origin kind. -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK: Origin [[O_P]] contains no loans @@ -235,7 +235,7 @@ void reassign_in_if(bool condition) { // CHECK: Expire (LoanID: [[L_S2]]) // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] // CHECK-DAG: Origin [[O_ADDR_S1]] contains Loan [[L_S1]] @@ -276,7 +276,7 @@ void assign_in_switch(int mode) { // CHECK-DAG: Expire (LoanID: [[L_S2]]) // CHECK-DAG: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S3]] @@ -299,7 +299,7 @@ void loan_in_loop(bool condition) { // CHECK: Expire (LoanID: [[L_INNER]]) } } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_INNER]] // CHECK-DAG: Origin [[O_ADDR_INNER]] contains Loan [[L_INNER]] @@ -326,7 +326,7 @@ void loop_with_break(int count) { // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK-LABEL: Dataflow results: +// CHECK-LABEL: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] // CHECK-DAG: Origin [[O_ADDR_S1]] contains Loan [[L_S1]] @@ -355,7 +355,7 @@ void nested_scopes() { // CHECK: Expire (LoanID: [[L_OUTER]]) } -// CHECK-LABEL: Dataflow results: +// CHECK-LABEL: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_INNER]] // CHECK-DAG: Origin [[O_ADDR_INNER]] contains Loan [[L_INNER]] // CHECK-DAG: Origin [[O_ADDR_OUTER]] contains Loan [[L_OUTER]] diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp index c390fee1c38d9..5ecb8c607f59a 100644 --- a/clang/test/SemaCXX/constant-expression-cxx11.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp @@ -1413,8 +1413,8 @@ namespace ComplexConstexpr { static_assert(t2p[2] == 0.0, ""); // expected-error {{constant expr}} expected-note {{one-past-the-end pointer}} static_assert(t2p[3] == 0.0, ""); // expected-error {{constant expr}} expected-note {{cannot refer to element 3 of array of 2 elements}} constexpr _Complex float *p = 0; // expected-warning {{'_Complex' is a C99 extension}} - constexpr float pr = __real *p; // expected-error {{constant expr}} expected-note {{cannot access real component of null}} - constexpr float pi = __imag *p; // expected-error {{constant expr}} expected-note {{cannot access imaginary component of null}} + constexpr float pr = __real *p; // expected-error {{constant expr}} expected-note {{dereferencing a null pointer}} + constexpr float pi = __imag *p; // expected-error {{constant 
expr}} expected-note {{dereferencing a null pointer}} constexpr const _Complex double *q = &test3 + 1; // expected-warning {{'_Complex' is a C99 extension}} constexpr double qr = __real *q; // expected-error {{constant expr}} expected-note {{cannot access real component of pointer past the end}} constexpr double qi = __imag *q; // expected-error {{constant expr}} expected-note {{cannot access imaginary component of pointer past the end}} diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp index e16a69df3830d..182c0d01141ff 100644 --- a/clang/test/SemaCXX/constant-expression-cxx14.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp @@ -265,7 +265,7 @@ namespace const_modify { namespace null { constexpr int test(int *p) { - return *p = 123; // expected-note {{assignment to dereferenced null pointer}} + return *p = 123; // expected-note {{dereferencing a null pointer}} } static_assert(test(0), ""); // expected-error {{constant expression}} expected-note {{in call}} } @@ -1321,3 +1321,127 @@ constexpr bool check = different_in_loop(); // expected-error@-1 {{}} expected-note@-1 {{in call}} } + +namespace comparison_dead_variable { + constexpr bool f() { + int *p1 = 0, *p2 = 0; + { + int x = 0; p1 = &x; + } + { + int x = 0; p2 = &x; + } + return p1 != p2; + } + // FIXME: This should fail. + static_assert(f(),""); + +} +namespace GH48665 { +constexpr bool foo(int *i) { + int &j = *i; + // expected-note@-1 {{dereferencing a null pointer}} + return true; +} + +static_assert(foo(nullptr), ""); // expected-note {{in call to 'foo(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + +constexpr bool foo_rvalue(int *i) { + int &&j = (int&&)*i; + // expected-note@-1 {{dereferencing a null pointer}} + return true; +} +static_assert(foo_rvalue(nullptr), ""); // expected-note {{in call to 'foo_rvalue(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + +int arr[3]; // expected-note {{declared here}} +constexpr bool f() { // cxx14_20-error {{constexpr function never produces a constant expression}} + int &r = arr[3]; // expected-note {{read of dereferenced one-past-the-end pointer}} \ + // cxx14_20-note {{read of dereferenced one-past-the-end pointer}} \ + // expected-warning {{array index 3 is past the end of the array}} + return true; +} +static_assert(f(), ""); // expected-note {{in call to 'f()'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + + +struct Aggregate { + int &r; +}; +constexpr bool test_agg(int *i) { + Aggregate a{*i}; //expected-note {{dereferencing a null pointer}} + return true; +} +static_assert(test_agg(nullptr), ""); // expected-note {{in call to 'test_agg(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + +struct B { + constexpr B(int *p) : r{*p} {} // expected-note {{dereferencing a null pointer}} + int &r; +}; + +constexpr bool test_ctr(int *i) { + B b(i); // expected-note {{in call to 'B(nullptr)'}} + return true; +} + +static_assert(test_ctr(nullptr), ""); // expected-note {{in call to 'test_ctr(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + + +// verify that we can dereference function pointers +namespace functions { + +constexpr int f() {return 0;} +constexpr int(*f_ptr)() = &f; +constexpr int(*null_ptr)() = nullptr; + +constexpr int(&f_ref)() = 
f; +constexpr int test = (*f_ptr)(); +constexpr int test2 = (*f_ref)(); +constexpr int test3 = (*f_ref)(); +constexpr int test4 = (*null_ptr)(); +//expected-error@-1 {{constexpr variable 'test4' must be initialized by a constant expression}} \ +//expected-note@-1 {{'(*null_ptr)' evaluates to a null function pointer}} + +constexpr int(*f_ptr_arr[1])() = {&f}; +constexpr int test_array_ok = (f_ptr_arr[0])(); +constexpr int test_array_err = (f_ptr_arr[1])(); +// expected-error@-1 {{constexpr variable 'test_array_err' must be initialized by a constant expression}} \ +// expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} + +struct S { + int(*f_ptr)() = &f; + int(*f_ptr_arr[1])() = {&f}; + int(&f_ref)() = f; + int(*null_ptr)() = nullptr; +}; + +constexpr int test_member() { + S s {}; + (*s.f_ptr)(); + (*s.f_ref)(); + (s.f_ref)(); + (s.f_ptr_arr[0])(); + (s.f_ptr_arr[1])(); + // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} + return 0; +} +constexpr int test_member_null() { // cxx14_20-error {{never produces a constant expression}} + S s {}; + (*s.null_ptr)(); // expected-note {{'(*s.null_ptr)' evaluates to a null function pointer}} \ + // cxx14_20-note {{'(*s.null_ptr)' evaluates to a null function pointer}} + return 0; +} + +static_assert(test_member(), ""); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} \ +// expected-note@-1 {{in call to 'test_member()'}} + +static_assert(test_member_null(), ""); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} \ +// expected-note@-1 {{in call to 'test_member_null()'}} + +} +} diff --git a/clang/test/SemaCXX/constant-expression-cxx2a.cpp b/clang/test/SemaCXX/constant-expression-cxx2a.cpp index 85720606fe9de..ffb7e633c2919 100644 --- a/clang/test/SemaCXX/constant-expression-cxx2a.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx2a.cpp @@ -927,7 +927,7 @@ namespace dynamic_alloc { constexpr void use_after_free() { // expected-error {{never produces a constant expression}} int *p = new int; delete p; - *p = 1; // expected-note {{assignment to heap allocated object that has been deleted}} + *p = 1; // expected-note {{read of heap allocated object that has been deleted}} } constexpr void use_after_free_2() { // expected-error {{never produces a constant expression}} struct X { constexpr void f() {} }; diff --git a/clang/test/SemaCXX/constant-expression-p2280r4.cpp b/clang/test/SemaCXX/constant-expression-p2280r4.cpp index dffb386f530f4..16f5f823d26c1 100644 --- a/clang/test/SemaCXX/constant-expression-p2280r4.cpp +++ b/clang/test/SemaCXX/constant-expression-p2280r4.cpp @@ -319,7 +319,7 @@ namespace casting { } namespace pointer_comparisons { - extern int &extern_n; // interpreter-note 2 {{declared here}} + extern int &extern_n; // interpreter-note 4 {{declared here}} extern int &extern_n2; constexpr int f1(bool b, int& n) { if (b) { @@ -330,14 +330,56 @@ namespace pointer_comparisons { // FIXME: interpreter incorrectly rejects; both sides are the same constexpr-unknown value. static_assert(f1(false, extern_n)); // interpreter-error {{static assertion expression is not an integral constant expression}} \ // interpreter-note {{initializer of 'extern_n' is unknown}} - // FIXME: We should diagnose this: we don't know if the references bind - // to the same object. 
- static_assert(&extern_n != &extern_n2); // interpreter-error {{static assertion expression is not an integral constant expression}} \ + static_assert(&extern_n != &extern_n2); // expected-error {{static assertion expression is not an integral constant expression}} \ + // nointerpreter-note {{comparison between pointers to unrelated objects '&extern_n' and '&extern_n2' has unspecified value}} \ // interpreter-note {{initializer of 'extern_n' is unknown}} void f2(const int &n) { - // FIXME: We should not diagnose this: the two objects provably have - // different addresses because the lifetime of "n" extends across - // the initialization. - constexpr int x = &x == &n; // nointerpreter-error {{must be initialized by a constant expression}} + constexpr int x = &x == &n; // nointerpreter-error {{must be initialized by a constant expression}} \ + // nointerpreter-note {{comparison between pointers to unrelated objects '&x' and '&n' has unspecified value}} + // Distinct variables are not equal, even if they're local variables. + constexpr int y = &x == &y; + static_assert(!y); } + constexpr int f3() { + int x; + return &x == &extern_n; // nointerpreter-note {{comparison between pointers to unrelated objects '&x' and '&extern_n' has unspecified value}} \ + // interpreter-note {{initializer of 'extern_n' is unknown}} + } + static_assert(!f3()); // expected-error {{static assertion expression is not an integral constant expression}} \ + // expected-note {{in call to 'f3()'}} + constexpr int f4() { + int *p = new int; + bool b = p == &extern_n; // nointerpreter-note {{comparison between pointers to unrelated objects '&{*new int#0}' and '&extern_n' has unspecified value}} \ + // interpreter-note {{initializer of 'extern_n' is unknown}} + delete p; + return b; + } + static_assert(!f4()); // expected-error {{static assertion expression is not an integral constant expression}} \ + // expected-note {{in call to 'f4()'}} +} + +namespace GH149188 { +namespace enable_if_1 { + template <__SIZE_TYPE__ N> + constexpr void foo(const char (&Str)[N]) + __attribute((enable_if(__builtin_strlen(Str), ""))) {} + + void x() { + foo("1234"); + } +} + +namespace enable_if_2 { + constexpr const char (&f())[]; + extern const char (&Str)[]; + constexpr int foo() + __attribute((enable_if(__builtin_strlen(Str), ""))) + {return __builtin_strlen(Str);} + + constexpr const char (&f())[] {return "a";} + constexpr const char (&Str)[] = f(); + void x() { + constexpr int x = foo(); + } +} } diff --git a/clang/test/SemaCXX/constexpr-backtrace-limit.cpp b/clang/test/SemaCXX/constexpr-backtrace-limit.cpp index e867afdff5c3c..f0c1206a4b8d3 100644 --- a/clang/test/SemaCXX/constexpr-backtrace-limit.cpp +++ b/clang/test/SemaCXX/constexpr-backtrace-limit.cpp @@ -15,14 +15,14 @@ // RUN: not %clang_cc1 -std=c++11 -fsyntax-only %s -fconstexpr-backtrace-limit=2 -fconstexpr-depth=8 -fno-caret-diagnostics 2>&1 | FileCheck %s -check-prefix=TEST3 // TEST3: constant expression -// TEST3-NEXT: reinterpret_cast +// TEST3-NEXT: dereferencing a null pointer // TEST3-NEXT: in call to 'recurse(0)' // TEST3-NEXT: skipping 4 calls // TEST3-NEXT: in call to 'recurse(5)' // RUN: not %clang_cc1 -std=c++11 -fsyntax-only %s -fconstexpr-backtrace-limit=8 -fconstexpr-depth=8 -fno-caret-diagnostics 2>&1 | FileCheck %s -check-prefix=TEST4 // TEST4: constant expression -// TEST4-NEXT: reinterpret_cast +// TEST4-NEXT: dereferencing a null pointer // TEST4-NEXT: in call to 'recurse(0)' // TEST4-NEXT: in call to 'recurse(1)' // TEST4-NEXT: in call to 'recurse(2)' 
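A minimal sketch (not part of the patch; 'recurse' is the function the CHECK
lines above refer to, but this body is illustrative rather than the test's
exact source) of the pattern TEST3 and TEST4 exercise, assuming the updated
"dereferencing a null pointer" note wording:

    constexpr int recurse(int n) {
      return n ? recurse(n - 1) : *(int *)nullptr; // null dereference in the innermost frame
    }
    static_assert(recurse(5) == 0, ""); // error: not an integral constant expression
    // note: dereferencing a null pointer is not allowed in a constant expression
    // note: in call to 'recurse(0)' ... skipping 4 calls ... in call to 'recurse(5)'

With -fconstexpr-backtrace-limit=2 only the innermost and outermost frames of
the evaluation backtrace are printed, which is what TEST3 checks; TEST4 raises
the limit so the whole call chain is shown.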
diff --git a/clang/test/SemaCXX/constexpr-never-constant.cpp b/clang/test/SemaCXX/constexpr-never-constant.cpp index 307810ee263dd..5756bb647ce88 100644 --- a/clang/test/SemaCXX/constexpr-never-constant.cpp +++ b/clang/test/SemaCXX/constexpr-never-constant.cpp @@ -24,3 +24,10 @@ constexpr void other_func() { throw 12; } + +namespace GH149041 { + // Make sure these don't trigger the diagnostic. + extern const bool& b; + constexpr bool fun1() { return b; } + constexpr bool fun2(const bool& b) { return b; } +} diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 6987d0c020457..2253cbb26285e 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -1347,3 +1347,13 @@ int main() { // expected-note@#S3-f-cand2 {{candidate function not viable: no known conversion from 'S3' to 'int' for object argument}} } } + +namespace GH113185 { + +void Bar(this int) { // expected-note {{candidate function}} + // expected-error@-1 {{an explicit object parameter cannot appear in a non-member function}} + Bar(0); + Bar(); // expected-error {{no matching function for call to 'Bar'}} +} + +} diff --git a/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp b/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp index 6f6f9b04aa392..4cf0e9ffe1d64 100644 --- a/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp +++ b/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++2c -verify %s +// RUN: %clang_cc1 -std=c++2c -verify %s -fexperimental-new-constant-interpreter namespace std { diff --git a/clang/test/SemaCXX/noreturn-vars.cpp b/clang/test/SemaCXX/noreturn-vars.cpp new file mode 100644 index 0000000000000..ca65fcf5ca31d --- /dev/null +++ b/clang/test/SemaCXX/noreturn-vars.cpp @@ -0,0 +1,227 @@ +// RUN: %clang_cc1 -fsyntax-only %s -verify + +[[noreturn]] extern void noret(); +[[noreturn]] extern void noret2(); +extern void ordinary(); + +typedef void (*func_type)(void); + +// Constant initialization. + +void (* const const_fptr)() = noret; +[[noreturn]] void test_global_const() { + const_fptr(); +} + +const func_type const_fptr_cast = (func_type)noret2; +[[noreturn]] void test_global_cast() { + const_fptr_cast(); +} + +void (* const const_fptr_list)() = {noret}; +[[noreturn]] void test_global_list() { + const_fptr_list(); +} + +const func_type const_fptr_fcast = func_type(noret2); +[[noreturn]] void test_global_fcast() { + const_fptr_fcast(); +} + +[[noreturn]] void test_local_const() { + void (* const fptr)() = noret; + fptr(); +} + +// Global variable assignment. +void (*global_fptr)() = noret; + +[[noreturn]] void test_global_noassign() { + global_fptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +[[noreturn]] void test_global_assign() { + global_fptr = noret; + global_fptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +// Local variable assignment. 
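+// In the tests below, a call through a local function pointer is treated as
+// satisfying [[noreturn]] only when every assignment reaching the call site
+// stores a noreturn function; if any reaching value may be an ordinary
+// function, the "should not return" warning is still expected.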
+ +[[noreturn]] void test_init() { + func_type func_ptr = noret; + func_ptr(); +} + +[[noreturn]] void test_assign() { + void (*func_ptr)(void); + func_ptr = noret; + func_ptr(); +} + +[[noreturn]] void test_override() { + func_type func_ptr; + func_ptr = ordinary; + func_ptr = noret; + func_ptr(); +} + +[[noreturn]] void test_if_all(int x) { + func_type func_ptr; + if (x > 0) + func_ptr = noret; + else + func_ptr = noret2; + func_ptr(); +} + +[[noreturn]] void test_if_mix(int x) { + func_type func_ptr; + if (x > 0) + func_ptr = noret; + else + func_ptr = ordinary; + func_ptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +[[noreturn]] void test_if_opt(int x) { + func_type func_ptr = noret; + if (x > 0) + func_ptr = ordinary; + func_ptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +[[noreturn]] void test_if_opt2(int x) { + func_type func_ptr = ordinary; + if (x > 0) + func_ptr = noret; + func_ptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +[[noreturn]] void test_if_nest_all(int x, int y) { + func_type func_ptr; + if (x > 0) { + if (y > 0) + func_ptr = noret; + else + func_ptr = noret2; + } else { + if (y < 0) + func_ptr = noret2; + else + func_ptr = noret; + } + func_ptr(); +} + +[[noreturn]] void test_if_nest_mix(int x, int y) { + func_type func_ptr; + if (x > 0) { + if (y > 0) + func_ptr = noret; + else + func_ptr = noret2; + } else { + if (y < 0) + func_ptr = ordinary; + else + func_ptr = noret; + } + func_ptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +[[noreturn]] void test_switch_all(int x) { + func_type func_ptr; + switch(x) { + case 1: + func_ptr = noret; + break; + default: + func_ptr = noret2; + break; + } + func_ptr(); +} + +[[noreturn]] void test_switch_mix(int x) { + func_type func_ptr; + switch(x) { + case 1: + func_ptr = ordinary; + break; + default: + func_ptr = noret; + break; + } + func_ptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +[[noreturn]] void test_switch_fall(int x) { + func_type func_ptr; + switch(x) { + case 1: + func_ptr = ordinary; + default: + func_ptr = noret; + break; + } + func_ptr(); +} + +[[noreturn]] void test_switch_all_nest(int x, int y) { + func_type func_ptr; + switch(x) { + case 1: + func_ptr = noret; + break; + default: + if (y > 0) + func_ptr = noret2; + else + func_ptr = noret; + break; + } + func_ptr(); +} + +[[noreturn]] void test_switch_mix_nest(int x, int y) { + func_type func_ptr; + switch(x) { + case 1: + func_ptr = noret; + break; + default: + if (y > 0) + func_ptr = noret2; + else + func_ptr = ordinary; + break; + } + func_ptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +// Function parameters. + +[[noreturn]] void test_param(void (*func_ptr)() = noret) { + func_ptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +[[noreturn]] void test_const_param(void (* const func_ptr)() = noret) { + func_ptr(); +} // expected-warning {{function declared 'noreturn' should not return}} + +// Escaped value. 
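+// Once the pointer escapes by reference or by address, it can no longer be
+// assumed to still hold a noreturn function at the call site, so the warning
+// is expected even though only noreturn values are stored directly here.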
+
+extern void abc_01(func_type &);
+extern void abc_02(func_type *);
+
+[[noreturn]] void test_escape_ref() {
+ func_type func_ptr = noret;
+ abc_01(func_ptr);
+ func_ptr();
+} // expected-warning {{function declared 'noreturn' should not return}}
+
+[[noreturn]] void test_escape_addr() {
+ func_type func_ptr = noret;
+ abc_02(&func_ptr);
+ func_ptr();
+} // expected-warning {{function declared 'noreturn' should not return}}
diff --git a/clang/test/SemaCXX/type-aware-class-scoped-mismatched-constraints.cpp b/clang/test/SemaCXX/type-aware-class-scoped-mismatched-constraints.cpp
index 57e6d953c2ad6..4ccc256d63629 100644
--- a/clang/test/SemaCXX/type-aware-class-scoped-mismatched-constraints.cpp
+++ b/clang/test/SemaCXX/type-aware-class-scoped-mismatched-constraints.cpp
@@ -1,13 +1,15 @@
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fexceptions -fcxx-exceptions -fsized-deallocation -faligned-allocation -Wno-non-c-typedef-for-linkage -DDEFAULT_DELETE=1
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fexceptions -fcxx-exceptions -fno-sized-deallocation -faligned-allocation -Wno-non-c-typedef-for-linkage -DDEFAULT_DELETE=0
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fexceptions -fcxx-exceptions -fsized-deallocation -fno-aligned-allocation -Wno-non-c-typedef-for-linkage -DDEFAULT_DELETE=1
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fexceptions -fcxx-exceptions -fno-sized-deallocation -fno-aligned-allocation -Wno-non-c-typedef-for-linkage -DDEFAULT_DELETE=0
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fcxx-exceptions -fsized-deallocation -faligned-allocation -Wno-non-c-typedef-for-linkage -DDEFAULT_DELETE=1
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fcxx-exceptions -fno-sized-deallocation -faligned-allocation -Wno-non-c-typedef-for-linkage -DDEFAULT_DELETE=0
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fcxx-exceptions -fsized-deallocation -fno-aligned-allocation -Wno-non-c-typedef-for-linkage -DDEFAULT_DELETE=1
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fcxx-exceptions -fno-sized-deallocation -fno-aligned-allocation -Wno-non-c-typedef-for-linkage -DDEFAULT_DELETE=0
namespace std {
template <class T> struct type_identity {};
enum class align_val_t : __SIZE_TYPE__ {};
}
+static_assert(__has_extension(cxx_type_aware_allocators), "Verifying the type aware extension flag is set");
+
using size_t = __SIZE_TYPE__;
void *operator new(size_t); // #default_operator_new
diff --git a/clang/test/SemaCXX/type-aware-coroutines.cpp b/clang/test/SemaCXX/type-aware-coroutines.cpp
index a54d37c47dbd9..742e5f02b6dab 100644
--- a/clang/test/SemaCXX/type-aware-coroutines.cpp
+++ b/clang/test/SemaCXX/type-aware-coroutines.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fcoroutines -fexceptions -Wall -Wpedantic
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fcoroutines -fexceptions -Wall -Wpedantic
#include "Inputs/std-coroutine.h"
diff --git a/clang/test/SemaCXX/type-aware-new-constexpr.cpp
b/clang/test/SemaCXX/type-aware-new-constexpr.cpp index 105610cd103e6..4098beb56376e 100644 --- a/clang/test/SemaCXX/type-aware-new-constexpr.cpp +++ b/clang/test/SemaCXX/type-aware-new-constexpr.cpp @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fsized-deallocation -faligned-allocation -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fno-sized-deallocation -faligned-allocation -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fsized-deallocation -fno-aligned-allocation -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fno-sized-deallocation -fno-aligned-allocation -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fsized-deallocation -faligned-allocation -fexperimental-new-constant-interpreter -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fno-sized-deallocation -faligned-allocation -fexperimental-new-constant-interpreter -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fsized-deallocation -fno-aligned-allocation -fexperimental-new-constant-interpreter -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fno-sized-deallocation -fno-aligned-allocation -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fsized-deallocation -faligned-allocation +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fno-sized-deallocation -faligned-allocation +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fsized-deallocation -fno-aligned-allocation +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fno-sized-deallocation -fno-aligned-allocation +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fsized-deallocation -faligned-allocation -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fno-sized-deallocation -faligned-allocation -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fsized-deallocation -fno-aligned-allocation -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fno-sized-deallocation -fno-aligned-allocation -fexperimental-new-constant-interpreter namespace std { diff --git a/clang/test/SemaCXX/type-aware-new-delete-arrays.cpp b/clang/test/SemaCXX/type-aware-new-delete-arrays.cpp index b1c73236476c4..37d645f894a16 100644 --- a/clang/test/SemaCXX/type-aware-new-delete-arrays.cpp +++ b/clang/test/SemaCXX/type-aware-new-delete-arrays.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fsized-deallocation -faligned-allocation -Wall -Wpedantic -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fno-sized-deallocation -faligned-allocation -Wall -Wpedantic -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fno-sized-deallocation -fno-aligned-allocation -Wall -Wpedantic -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fsized-deallocation -fno-aligned-allocation -Wall -Wpedantic +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 
-Wno-ext-cxx-type-aware-allocators -fexceptions -fsized-deallocation -faligned-allocation -Wall -Wpedantic
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fno-sized-deallocation -faligned-allocation -Wall -Wpedantic
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fno-sized-deallocation -fno-aligned-allocation -Wall -Wpedantic
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fsized-deallocation -fno-aligned-allocation -Wall -Wpedantic
namespace std {
template <class T> struct type_identity {};
diff --git a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp
index c85b92718479a..87dc58861ee81 100644
--- a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp
+++ b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fsized-deallocation -faligned-allocation
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fno-sized-deallocation -faligned-allocation
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fno-sized-deallocation -fno-aligned-allocation
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fsized-deallocation -fno-aligned-allocation
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 -fsized-deallocation -faligned-allocation
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 -fno-sized-deallocation -faligned-allocation
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 -fno-sized-deallocation -fno-aligned-allocation
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 -fsized-deallocation -fno-aligned-allocation
namespace std {
template <class T> struct type_identity {};
diff --git a/clang/test/SemaCXX/type-aware-new-delete-basic-in-class-declarations.cpp b/clang/test/SemaCXX/type-aware-new-delete-basic-in-class-declarations.cpp
index 34bd1d4206be1..0a557ac794f78 100644
--- a/clang/test/SemaCXX/type-aware-new-delete-basic-in-class-declarations.cpp
+++ b/clang/test/SemaCXX/type-aware-new-delete-basic-in-class-declarations.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify=expected,precxx26 %s -std=c++23
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26
+// RUN: %clang_cc1 -triple arm64-apple-macosx -std=c++26 -fsyntax-only -verify=expected,clangext %s
+// RUN: %clang_cc1 -triple arm64-apple-macosx -std=c++26 -Wno-ext-cxx-type-aware-allocators -fsyntax-only -verify %s
namespace std {
template <class T> struct type_identity {};
@@ -13,24 +13,24 @@ using size_t = __SIZE_TYPE__;
struct S {
void *operator new(std::type_identity<S>, size_t, std::align_val_t); // #1
void operator delete(std::type_identity<S>, void *, size_t, std::align_val_t); // #2
- // precxx26-warning@#1 {{type aware allocators are a C++2c extension}}
- // precxx26-warning@#2 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#1 {{type aware allocators are a Clang extension}}
+ // clangext-warning@#2 {{type aware allocators are a Clang extension}}
void operator delete(S *, std::destroying_delete_t);
};
template <class T> struct S2 {
void *operator new(std::type_identity<S2<T>>, size_t, std::align_val_t); // #3
void operator delete(std::type_identity<S2<T>>, void *, size_t, std::align_val_t); // #4
- // precxx26-warning@#3 {{type aware allocators are a C++2c extension}}
- // precxx26-warning@#4 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#3 {{type aware allocators are a Clang extension}}
+ // clangext-warning@#4 {{type aware allocators are a Clang extension}}
void operator delete(S2 *, std::destroying_delete_t);
};
struct S3 {
template <class T> void *operator new(std::type_identity<T>, size_t, std::align_val_t); // #5
template <class T> void operator delete(std::type_identity<T>, void *, size_t, std::align_val_t); // #6
- // precxx26-warning@#5 {{type aware allocators are a C++2c extension}}
- // precxx26-warning@#6 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#5 {{type aware allocators are a Clang extension}}
+ // clangext-warning@#6 {{type aware allocators are a Clang extension}}
void operator delete(S3 *, std::destroying_delete_t);
};
@@ -38,34 +38,34 @@ struct S4 {
template <class T> void *operator new(std::type_identity<T>, size_t, std::align_val_t); // #7
template <class T> void operator delete(std::type_identity<T>, void *, size_t, std::align_val_t); // #8
template <class T> void operator delete(std::type_identity<T>, S4 *, std::destroying_delete_t, size_t, std::align_val_t); // #9
- // precxx26-warning@#7 {{type aware allocators are a C++2c extension}}
- // precxx26-warning@#8 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#7 {{type aware allocators are a Clang extension}}
+ // clangext-warning@#8 {{type aware allocators are a Clang extension}}
// expected-error@#9 {{destroying delete is not permitted to be type aware}}
};
struct S5 {
template <class T> void operator delete(std::type_identity<T>, T *, size_t, std::align_val_t); // #10
// expected-error@#10 {{type aware 'operator delete' cannot take a dependent type as its 2nd parameter}}
- // precxx26-warning@#10 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#10 {{type aware allocators are a Clang extension}}
};
struct S6 {
template <class T> void *operator new(std::type_identity<T>, T, std::align_val_t); // #11
// expected-error@#11 {{type aware 'operator new' cannot take a dependent type as its 2nd parameter}}
- // precxx26-warning@#11 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#11 {{type aware allocators are a Clang extension}}
template <class T> void operator delete(std::type_identity<T>, T, size_t, std::align_val_t); // #12
// expected-error@#12 {{type aware 'operator delete' cannot take a dependent type as its 2nd parameter}}
- // precxx26-warning@#12 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#12 {{type aware allocators are a Clang extension}}
};
template <class T> struct S7 {
template <class U> void *operator new(std::type_identity<T>, U, std::align_val_t); // #13
// expected-error@#13 {{type aware 'operator new' cannot take a dependent type as its 2nd parameter;}}
- // precxx26-warning@#13 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#13 {{type aware allocators are a Clang extension}}
template <class U> void operator delete(std::type_identity<T>, U, size_t, std::align_val_t); // #14
// expected-error@#14 {{type aware 'operator delete' cannot take a dependent type as its 2nd parameter;}}
- // precxx26-warning@#14 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#14 {{type aware allocators are a Clang extension}}
template <class U> void operator delete(std::type_identity<T>, S7 *, std::destroying_delete_t, U, std::align_val_t); // #15
// expected-error@#15 {{destroying delete is not permitted to be type aware}}
void operator delete(S7 *, std::destroying_delete_t, U); // #16
@@ -80,10 +80,10 @@ void f() {
struct S8 {
template <class U> void *operator new(std::type_identity<S8>, U, std::align_val_t); // #17
// expected-error@#17 {{type aware 'operator new' cannot take a dependent type as its 2nd parameter;}}
- // precxx26-warning@#17 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#17 {{type aware allocators are a Clang extension}}
template <class U> void operator delete(std::type_identity<S8>, U, size_t, std::align_val_t); // #18
// expected-error@#18 {{type aware 'operator delete' cannot take a dependent type as its 2nd parameter;}}
- // precxx26-warning@#18 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#18 {{type aware allocators are a Clang extension}}
template <class U> void operator delete(std::type_identity<S8>, S8 *, std::destroying_delete_t, U, std::align_val_t); // #19
// expected-error@#19 {{destroying delete is not permitted to be type aware}}
};
@@ -95,23 +95,23 @@ using UsingAlias = std::type_identity;
struct S9 {
void *operator new(Alias<size_t>, std::align_val_t);
template <class T> void *operator new(Alias<std::type_identity<T>>, Alias<size_t>, std::align_val_t); // #20
- // precxx26-warning@#20 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#20 {{type aware allocators are a Clang extension}}
void *operator new(Alias<std::type_identity<S9>>, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
template <class T> void operator delete(Alias<std::type_identity<T>>, void *, size_t, std::align_val_t); // #21
- // precxx26-warning@#21{{type aware allocators are a C++2c extension}}
+ // clangext-warning@#21{{type aware allocators are a Clang extension}}
void operator delete(Alias<std::type_identity<S9>>, void *, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
};
struct S10 {
template <class T> void *operator new(TypeIdentityAlias<T>, size_t, std::align_val_t); // #22
- // precxx26-warning@#22 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@#22 {{type aware allocators are a Clang extension}}
void *operator new(TypeIdentityAlias<S10>, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
template <class T> void operator delete(TypeIdentityAlias<T>, void *, size_t, std::align_val_t); // #23
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
void operator delete(TypeIdentityAlias<S10>, void *, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
};
void test() {
@@ -123,28 +123,28 @@ void test() {
struct S11 {
template <class T> void *operator new(TypedefAlias<T>, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
void *operator new(TypedefAlias<S11>, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
template <class T> void operator delete(TypedefAlias<T>, void *, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
void operator delete(TypedefAlias<S11>, void *, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
};
struct S12 {
template <class T> void *operator new(UsingAlias<T>, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
void *operator new(UsingAlias<S12>, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
template <class T> void operator delete(UsingAlias<T>, void *, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
void operator delete(UsingAlias<S12>, void *, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
};
struct S13 {
void *operator new(std::type_identity<S13>, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
void operator delete(std::type_identity<S13>, void*, size_t, std::align_val_t);
- // precxx26-warning@-1 {{type aware allocators are a C++2c extension}}
+ // clangext-warning@-1 {{type aware allocators are a Clang extension}}
};
diff --git a/clang/test/SemaCXX/type-aware-new-delete-basic-resolution.cpp b/clang/test/SemaCXX/type-aware-new-delete-basic-resolution.cpp
index 786899295f627..978f6ee5cf6c6 100644
--- a/clang/test/SemaCXX/type-aware-new-delete-basic-resolution.cpp
+++ b/clang/test/SemaCXX/type-aware-new-delete-basic-resolution.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fexceptions -fsized-deallocation -faligned-allocation
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fexceptions -fno-sized-deallocation -faligned-allocation
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fexceptions -fsized-deallocation -fno-aligned-allocation
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -fexceptions -fno-sized-deallocation -fno-aligned-allocation
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fsized-deallocation -faligned-allocation
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fno-sized-deallocation -faligned-allocation
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fsized-deallocation -fno-aligned-allocation
+// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fno-sized-deallocation -fno-aligned-allocation
namespace std {
template <class T> struct type_identity {};
diff --git a/clang/test/SemaCXX/type-aware-new-delete-qualifiers.cpp b/clang/test/SemaCXX/type-aware-new-delete-qualifiers.cpp
index af5386b25451f..3a616226b8720 100644
--- a/clang/test/SemaCXX/type-aware-new-delete-qualifiers.cpp
+++ b/clang/test/SemaCXX/type-aware-new-delete-qualifiers.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s -DNO_TADD -std=c++26 -fsized-deallocation -faligned-allocation
-// RUN: %clang_cc1 -fsyntax-only -verify %s -DNO_TADD -std=c++26 -fno-sized-deallocation -faligned-allocation
-// RUN: %clang_cc1 -fsyntax-only -verify %s -DNO_TADD -std=c++26 -fno-sized-deallocation -fno-aligned-allocation
-// RUN: %clang_cc1 -fsyntax-only -verify %s -DNO_TADD -std=c++26 -fsized-deallocation -fno-aligned-allocation
+// RUN: %clang_cc1 -fsyntax-only -Wno-ext-cxx-type-aware-allocators -verify %s -DNO_TADD -std=c++26 -fsized-deallocation -faligned-allocation
+// RUN: %clang_cc1 -fsyntax-only -Wno-ext-cxx-type-aware-allocators -verify %s -DNO_TADD -std=c++26 -fno-sized-deallocation -faligned-allocation
+// RUN: %clang_cc1 -fsyntax-only -Wno-ext-cxx-type-aware-allocators -verify %s -DNO_TADD -std=c++26 -fno-sized-deallocation -fno-aligned-allocation
+// RUN: %clang_cc1 -fsyntax-only -Wno-ext-cxx-type-aware-allocators -verify %s -DNO_TADD -std=c++26 -fsized-deallocation -fno-aligned-allocation
namespace std {
template <class T> struct type_identity {
typedef T type;
diff --git a/clang/test/SemaCXX/type-aware-new-delete-transparent-contexts.cpp b/clang/test/SemaCXX/type-aware-new-delete-transparent-contexts.cpp
index 30fea464a8dc5..06374bce6e69c 100644
--- a/clang/test/SemaCXX/type-aware-new-delete-transparent-contexts.cpp
+++ b/clang/test/SemaCXX/type-aware-new-delete-transparent-contexts.cpp
@@ -2,9 +2,9 @@
// RUN: mkdir %t
// RUN: split-file %s %t
-// RUN: %clang_cc1 -fsyntax-only -verify %t/testing.cpp -std=c++26 -fexceptions -DTRANSPARENT_DECL=0
-// RUN: %clang_cc1 -fsyntax-only -verify %t/testing.cpp -std=c++26 -fexceptions -DTRANSPARENT_DECL=1
-// RUN: %clang_cc1 -fsyntax-only -verify %t/module_testing.cppm -std=c++26 -fexceptions -DTRANSPARENT_DECL=2
+// RUN: %clang_cc1 -fsyntax-only -verify %t/testing.cpp -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -DTRANSPARENT_DECL=0
+// RUN: %clang_cc1 -fsyntax-only -verify %t/testing.cpp -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -DTRANSPARENT_DECL=1
+// RUN: %clang_cc1 -fsyntax-only -verify %t/module_testing.cppm -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -DTRANSPARENT_DECL=2
//--- module_testing.cppm
// expected-no-diagnostics
diff --git a/clang/test/SemaCXX/type-aware-new-invalid-type-identity.cpp b/clang/test/SemaCXX/type-aware-new-invalid-type-identity.cpp
index d0242e43edcf1..502f4fab6b519 100644
--- a/clang/test/SemaCXX/type-aware-new-invalid-type-identity.cpp
+++ b/clang/test/SemaCXX/type-aware-new-invalid-type-identity.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=0
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=1
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=2
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=3
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=4
-// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -std=c++26
-DINVALID_TYPE_IDENTITY_VERSION=0 +// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=1 +// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=2 +// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=3 +// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 -DINVALID_TYPE_IDENTITY_VERSION=4 +// RUN: %clang_cc1 -triple arm64-apple-macosx -fsyntax-only -verify %s -Wno-ext-cxx-type-aware-allocators -std=c++26 namespace std { #if !defined(INVALID_TYPE_IDENTITY_VERSION) diff --git a/clang/test/SemaCXX/type-aware-placement-operators.cpp b/clang/test/SemaCXX/type-aware-placement-operators.cpp index b64832a76e20b..3ba5c4de4a54f 100644 --- a/clang/test/SemaCXX/type-aware-placement-operators.cpp +++ b/clang/test/SemaCXX/type-aware-placement-operators.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fcxx-exceptions -fsized-deallocation -faligned-allocation -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fcxx-exceptions -fno-sized-deallocation -faligned-allocation -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fcxx-exceptions -fno-sized-deallocation -fno-aligned-allocation -// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -fexceptions -fcxx-exceptions -fsized-deallocation -fno-aligned-allocation +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fcxx-exceptions -fsized-deallocation -faligned-allocation +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fcxx-exceptions -fno-sized-deallocation -faligned-allocation +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fcxx-exceptions -fno-sized-deallocation -fno-aligned-allocation +// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++26 -Wno-ext-cxx-type-aware-allocators -fexceptions -fcxx-exceptions -fsized-deallocation -fno-aligned-allocation namespace std { template struct type_identity {}; diff --git a/clang/test/SemaCXX/uninitialized.cpp b/clang/test/SemaCXX/uninitialized.cpp index 251e888f73973..cc368c22e0776 100644 --- a/clang/test/SemaCXX/uninitialized.cpp +++ b/clang/test/SemaCXX/uninitialized.cpp @@ -185,6 +185,10 @@ void test_const_ptr() { const int *ptr2; foo(ptr); // expected-warning {{variable 'ptr' is uninitialized when used here}} foobar(&ptr2); + int *ptr3; // expected-note {{initialize the variable 'ptr3' to silence this warning}} + const int *ptr4; // expected-note {{initialize the variable 'ptr4' to silence this warning}} + bar(ptr3); // expected-warning {{variable 'ptr3' is uninitialized when used here}} + bar(ptr4); // expected-warning {{variable 'ptr4' is uninitialized when used here}} } } diff --git a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp index d64ed1e5f260a..d82e2484ace1e 100644 --- a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp +++ b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp @@ -6196,6 +6196,8 @@ class Return { Mutex mu; Foo foo GUARDED_BY(mu); Foo* foo_ptr PT_GUARDED_BY(mu); + Foo foo_depr GUARDED_VAR; // test deprecated attribute + Foo* foo_ptr_depr 
PT_GUARDED_VAR; // test deprecated attribute
 
   Foo returns_value_locked() {
     MutexLock lock(&mu);
@@ -6297,6 +6299,18 @@ class Return {
     return *foo_ptr; // expected-warning {{returning the value that 'foo_ptr' points to by reference requires holding mutex 'mu' exclusively}}
   }
 
+  Foo *returns_ptr_deprecated() {
+    return &foo_depr; // expected-warning {{writing variable 'foo_depr' requires holding any mutex exclusively}}
+  }
+
+  Foo *returns_pt_ptr_deprecated() {
+    return foo_ptr_depr; // expected-warning {{writing the value pointed to by 'foo_ptr_depr' requires holding any mutex exclusively}}
+  }
+
+  Foo &returns_ref_deprecated() {
+    return *foo_ptr_depr; // expected-warning {{writing the value pointed to by 'foo_ptr_depr' requires holding any mutex exclusively}}
+  }
+
   // FIXME: Basic alias analysis would help catch cases like below.
   Foo *returns_ptr_alias() {
     mu.Lock();
diff --git a/clang/test/SemaCXX/warn-uninitialized-const-pointer.cpp b/clang/test/SemaCXX/warn-uninitialized-const-pointer.cpp
new file mode 100644
index 0000000000000..62802ba7375cc
--- /dev/null
+++ b/clang/test/SemaCXX/warn-uninitialized-const-pointer.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -Wuninitialized-const-pointer -verify %s
+
+template <class T>
+void ignore_template(const T *) {}
+void ignore(const int *) {}
+void dont_ignore_non_empty(const int *) { ; }
+void dont_ignore_block(const int *) { {} }
+void dont_ignore_try_block(const int *) try {
+} catch (...) {
+}
+int const_ptr_use(const int *);
+
+void f(int a) {
+  int i;
+  const_ptr_use(&i); // expected-warning {{variable 'i' is uninitialized when passed as a const pointer argument here}}
+  int j = j + const_ptr_use(&j); // expected-warning {{variable 'j' is uninitialized when used within its own initialization}}
+  int k = k; // expected-warning {{variable 'k' is uninitialized when used within its own initialization}}
+  const_ptr_use(&k);
+
+  // Only report if a variable is always uninitialized at the point of use
+  int l;
+  if (a < 42)
+    l = 1;
+  const_ptr_use(&l);
+
+  // Don't report if the called function is known to be empty.
+  int m;
+  ignore_template(&m);
+  ignore(&m);
+  dont_ignore_non_empty(&m); // expected-warning {{variable 'm' is uninitialized when passed as a const pointer argument here}}
+  int n;
+  dont_ignore_block(&n); // expected-warning {{variable 'n' is uninitialized when passed as a const pointer argument here}}
+  int o;
+  dont_ignore_try_block(&o); // expected-warning {{variable 'o' is uninitialized when passed as a const pointer argument here}}
+}
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp
index a7c19bcac1607..d6c32ca7baa5d 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-libc-functions.cpp
@@ -125,10 +125,13 @@ void safe_examples(std::string s1, int *p) {
   fprintf((FILE*)0, s1.c_str(), __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn
 
   char a[10];
+  char c = 'c';
   snprintf(a, sizeof a, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn
   snprintf(a, sizeof(decltype(a)), "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn
   snprintf(a, 10, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn
+  snprintf(&c, 1, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn
+  snprintf(nullptr, 0, "%s%d%s%p%s", __PRETTY_FUNCTION__, *p, "hello", s1.c_str()); // no warn
 }
diff --git a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
new file mode 100644
index 0000000000000..6cb3e56c20f0e
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl
@@ -0,0 +1,66 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify
+
+float test_no_second_arg(float3 p0) {
+  return refract(p0);
+  // expected-error@-1 {{no matching function for call to 'refract'}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}}
+}
+
+float test_no_third_arg(float3 p0) {
+  return refract(p0, p0);
+  // expected-error@-1 {{no matching function for call to 'refract'}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}}
+}
+
+float test_too_many_arg(float2 p0) {
+  return refract(p0, p0, p0, p0);
+  // expected-error@-1 {{no matching function for call to 'refract'}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}}
+  // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}}
+  //
expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} +} + +float test_double_inputs(double p0, double p1, double p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} +} + +float test_int_inputs(int p0, int p1, int p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} +} + +float1 test_vec1_inputs(float1 p0, float1 p1, float1 p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with T = float1]: no type named 'Type' in 'hlsl::__detail::enable_if>'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with T = float1]: no type named 'Type' in 'hlsl::__detail::enable_if>'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 1]: no type named 'Type' in 'hlsl::__detail::enable_if'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 1]: no type named 'Type' in 'hlsl::__detail::enable_if'}} +} + +typedef float float5 __attribute__((ext_vector_type(5))); + +float5 test_vec5_inputs(float5 p0, float5 p1, float p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: deduced conflicting types for parameter 'T' ('float5' (vector of 5 'float' values) vs. 'float')}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: deduced conflicting types for parameter 'T' ('float5' (vector of 5 'float' values) vs. 
'float')}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 5]: no type named 'Type' in 'hlsl::__detail::enable_if'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 5]: no type named 'Type' in 'hlsl::__detail::enable_if'}} +} diff --git a/clang/test/SemaObjC/ptrauth-pointers.m b/clang/test/SemaObjC/ptrauth-pointers.m new file mode 100644 index 0000000000000..5a3bd49d74f0d --- /dev/null +++ b/clang/test/SemaObjC/ptrauth-pointers.m @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -fblocks -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics -verify %s + +#if __has_feature(ptrauth_objc_signable_class) +@class TestClass; +typedef TestClass *ClassPtr; +typedef void(^BlockPtr)(); +@interface TestClass { +@public + __ptrauth(2, 1, 1) Class a; + __ptrauth(2, 1, 3) volatile Class vi; + __ptrauth(2, 1, 3) const Class ci; + __ptrauth(2, 1, 1) id b; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'id' is invalid}} + __ptrauth(2, 1, 2) ClassPtr c; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'ClassPtr' (aka 'TestClass *') is invalid}} + __ptrauth(2, 1, 2) BlockPtr d; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'BlockPtr' (aka 'void (^)()') is invalid}} +} + +struct TestStruct { + __ptrauth(2, 1, 3) Class e; + __ptrauth(2, 1, 3) volatile Class vi; + __ptrauth(2, 1, 3) const Class ci; + __ptrauth(2, 1, 4) id f; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'id' is invalid}} + __ptrauth(2, 1, 5) ClassPtr g; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'ClassPtr' (aka 'TestClass *') is invalid}} + __ptrauth(2, 1, 2) BlockPtr h; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'BlockPtr' (aka 'void (^)()') is invalid}} +}; + +@end + +void foo() { + __ptrauth(2, 1, 3) Class i; + __ptrauth(2, 1, 3) volatile Class vi; + __ptrauth(2, 1, 3) const Class ci = 0; + __ptrauth(2, 1, 4) id j; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'id' is invalid}} + __ptrauth(2, 1, 5) ClassPtr k; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'ClassPtr' (aka 'TestClass *') is invalid}} + __ptrauth(2, 1, 2) BlockPtr l; + // expected-error@-1 {{'__ptrauth' qualifier only applies to pointer or pointer sized integer types; 'BlockPtr' (aka 'void (^)()') is invalid}} +} + +#endif diff --git a/clang/test/SemaObjC/ptrauth-qualifier.m b/clang/test/SemaObjC/ptrauth-qualifier.m index 98d0248f169be..74bbe6f09899b 100644 --- a/clang/test/SemaObjC/ptrauth-qualifier.m +++ b/clang/test/SemaObjC/ptrauth-qualifier.m @@ -18,12 +18,24 @@ @interface Foo @property void *__ptrauth(1, 0, 1) invalid2; // expected-error@-1 {{property may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,0,1)'}} +@property unsigned long long __ptrauth(1, 1, 1) invalid3; +// expected-error@-1 {{property may not be qualified with '__ptrauth'; type is '__ptrauth(1,1,1) unsigned long long'}} + +@property unsigned long long __ptrauth(1, 0, 1) invalid4; +// expected-error@-1 {{property may not be qualified with '__ptrauth'; type is '__ptrauth(1,0,1) unsigned long long'}} + - (void *__ptrauth(1, 1, 1))invalid5; 
// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} - (void *__ptrauth(1, 0, 1))invalid6; // expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,0,1)'}} +- (unsigned long long __ptrauth(1, 1, 1))invalid7; +// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is '__ptrauth(1,1,1) unsigned long long'}} + +- (unsigned long long __ptrauth(1, 0, 1))invalid8; +// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is '__ptrauth(1,0,1) unsigned long long'}} + - (void)invalid9:(void *__ptrauth(1, 1, 1))a; // expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} // expected-note@-2 {{method 'invalid9:' declared here}} @@ -32,10 +44,17 @@ - (void)invalid10:(void *__ptrauth(1, 0, 1))a; // expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,0,1)'}} // expected-note@-2 {{method 'invalid10:' declared here}} +- (void)invalid11:(unsigned long long __ptrauth(1, 1, 1))a; +// expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is '__ptrauth(1,1,1) unsigned long long'}} +// expected-note@-2 {{method 'invalid11:' declared here}} + +- (void)invalid12:(unsigned long long __ptrauth(1, 0, 1))a; +// expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is '__ptrauth(1,0,1) unsigned long long'}} +// expected-note@-2 {{method 'invalid12:' declared here}} @end @implementation Foo -// expected-warning@-1 2{{method definition for}} +// expected-warning@-1 4{{method definition for}} - (void *__ptrauth(1, 1, 1))invalid13 { // expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} @@ -47,6 +66,16 @@ @implementation Foo return 0; } +- (unsigned long long __ptrauth(1, 1, 1))invalid15 { +// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is '__ptrauth(1,1,1) unsigned long long'}} + return 0; +} + +- (unsigned long long __ptrauth(1, 0, 1))invalid16 { +// expected-error@-1 {{return type may not be qualified with '__ptrauth'; type is '__ptrauth(1,0,1) unsigned long long'}} + return 0; +} + - (void)invalid17:(void *__ptrauth(1, 1, 1))a { // expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,1,1)'}} } @@ -55,4 +84,12 @@ - (void)invalid18:(void *__ptrauth(1, 0, 1))a { // expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is 'void *__ptrauth(1,0,1)'}} } +- (void)invalid19:(unsigned long long __ptrauth(1, 1, 1))a { +// expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is '__ptrauth(1,1,1) unsigned long long'}} +} + +- (void)invalid20:(unsigned long long __ptrauth(1, 0, 1))a { +// expected-error@-1 {{parameter type may not be qualified with '__ptrauth'; type is '__ptrauth(1,0,1) unsigned long long'}} +} + @end diff --git a/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp b/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp new file mode 100644 index 0000000000000..e0aee123fe754 --- /dev/null +++ b/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp @@ -0,0 +1,103 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +struct ImplicitCtorDtor{}; + +struct ImplDeletedCtor{ + ImplDeletedCtor(int i); +}; + +struct DefaultedCtor { + DefaultedCtor() = default; +}; + +struct ImpledCtor { + ImpledCtor() = 
default; +}; + + +struct DeletedCtor { + DeletedCtor() = delete; +}; + +struct ImpledDtor { + ~ImpledDtor(); +}; + +struct DefaultedDtor { + ~DefaultedDtor() = default; +}; + +struct DeletedDtor { + ~DeletedDtor() = delete; +}; + +struct ImplicitDelDtor { + DeletedDtor d; +}; + +void private_uses(ImplicitCtorDtor &CDT, ImplDeletedCtor &IDC, + DefaultedCtor &DefC, ImpledCtor &IC, DeletedCtor &DelC, + ImpledDtor &ID, DefaultedDtor &DefD, DeletedDtor &DelD, + ImplicitDelDtor &IDD) { + +#pragma acc parallel private(CDT) + ; + + // expected-error@+1{{variable of type 'ImplDeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}} +#pragma acc parallel private(IDC) + ; + +#pragma acc parallel private(DefC) + ; + +#pragma acc parallel private(IC) + ; + + // expected-error@+1{{variable of type 'DeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}} +#pragma acc parallel private(DelC) + ; + +#pragma acc parallel private(ID) + ; + +#pragma acc parallel private(DefD) + ; + + // expected-error@+1{{variable of type 'DeletedDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} +#pragma acc parallel private(DelD) + ; + + // expected-error@+1{{variable of type 'ImplicitDelDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} +#pragma acc parallel private(IDD) + ; + +} + +template +void private_templ(T& t) { +#pragma acc parallel private(t) // #PRIV + ; +} + +void inst(ImplicitCtorDtor &CDT, ImplDeletedCtor &IDC, + DefaultedCtor &DefC, ImpledCtor &IC, DeletedCtor &DelC, + ImpledDtor &ID, DefaultedDtor &DefD, DeletedDtor &DelD, + ImplicitDelDtor &IDD) { + private_templ(CDT); + // expected-error@#PRIV{{variable of type 'ImplDeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(IDC); + private_templ(DefC); + private_templ(IC); + // expected-error@#PRIV{{variable of type 'DeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(DelC); + private_templ(ID); + private_templ(DefD); + // expected-error@#PRIV{{variable of type 'DeletedDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(DelD); + // expected-error@#PRIV{{variable of type 'ImplicitDelDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(IDD); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl new file mode 100644 index 0000000000000..55d705e6ad238 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl @@ -0,0 +1,242 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -verify -emit-llvm -o - %s + +typedef float v16f __attribute__((ext_vector_type(16))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef float v2f __attribute__((ext_vector_type(2))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h 
__attribute__((ext_vector_type(32))); +typedef __bf16 v32bf16 __attribute__((ext_vector_type(32))); +typedef __bf16 v16bf16 __attribute__((ext_vector_type(16))); +typedef __bf16 v8bf16 __attribute__((ext_vector_type(8))); +typedef int v16i __attribute__((ext_vector_type(16))); +typedef int v8i __attribute__((ext_vector_type(8))); + +void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}} +} + +void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8bf16 c, int mod) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}} +} + +void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}} + *out = 
__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x64_fp8_fp8(global v8f* out, v8i a, v8i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x64_fp8_bf8(global v8f* out, v8i a, v8i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x64_bf8_fp8(global v8f* out, v8i a, v8i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x64_bf8_bf8(global v8f* out, v8i a, v8i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x64_fp8_fp8(global v8h* out, v8i a, v8i b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, mod, false); // expected-error 
{{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x64_fp8_bf8(global v8h* out, v8i a, v8i b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x64_bf8_fp8(global v8h* out, v8i a, v8i b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must be a constant integer}} +} + +void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c, int mod) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(mod, a, 0, b, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, mod, b, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, mod, 
false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x128_fp8_bf8(global v8h* out, v16i a, v16i b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x128_bf8_fp8(global v8h* out, v16i a, v16i b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must be a constant integer}} +} + +void test_amdgcn_wmma_f16_16x16x128_bf8_bf8(global v8h* out, v16i a, v16i b, v8h c, int mod) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must be a constant integer}} + *out 
= __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x128_fp8_fp8(global v8f* out, v16i a, v16i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x128_fp8_bf8(global v8f* out, v16i a, v16i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x128_bf8_fp8(global v8f* out, v16i a, v16i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_16x16x128_bf8_bf8(global v8f* out, v16i a, v16i b, v8f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must be a constant integer}} + *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must be a constant integer}} +} + +void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod) +{ + *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_32x16x128_f4' must be a constant integer}} +} + +void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, mod, false); // expected-error 
{{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}} +} + +void test_amdgcn_swmmac_bf16_16x16x64_bf16(global v8bf16* out, v16bf16 a, v32bf16 b, v8bf16 c, int index, int mod) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must be a constant integer}} +} + +void test_amdgcn_swmmac_bf16f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod) +{ + *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}} +} + +void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i c, int index, int mod) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}} +} + +void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f c, int index, int mod) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, false, mod); 
// expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a constant integer}} +} + +void test_amdgcn_swmmac_f16_16x16x64_f16(global v8h* out, v16h a, v32h b, v8h c, int index, int mod) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a constant integer}} + *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a constant integer}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl index 5915393ae7f56..8fbffbeea0531 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl @@ -8,3 +8,10 @@ void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local vo __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}} } + +void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int size, int vindex, int voffset, int soffset, int x) { + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, x, vindex, voffset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 3, vindex, voffset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl index 74944f2d93c72..cb832b9aa4845 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -verify -o - %s // REQUIRES: amdgpu-registered-target -void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) { +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int vindex, int offset, int soffset) { __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 
4, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}} } diff --git a/clang/test/SemaSPIRV/BuiltIns/refract-errors.c b/clang/test/SemaSPIRV/BuiltIns/refract-errors.c new file mode 100644 index 0000000000000..07486c2a60cbf --- /dev/null +++ b/clang/test/SemaSPIRV/BuiltIns/refract-errors.c @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 %s -triple spirv-pc-vulkan-compute -verify + +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef _Float16 half; +typedef half half2 __attribute__((ext_vector_type(2))); + +float2 test_no_third_arg(float2 p0) { + return __builtin_spirv_refract(p0, p0); + // expected-error@-1 {{too few arguments to function call, expected 3, have 2}} +} + +float2 test_too_many_arg(float2 p0, float p1) { + return __builtin_spirv_refract(p0, p0, p1, p1); + // expected-error@-1 {{too many arguments to function call, expected 3, have 4}} +} + +float test_double_scalar_inputs(double p0, double p1, double p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double')}} +} + +float test_int_scalar_inputs(int p0, int p1, int p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'int')}} +} + +float test_float_and_half_inputs(float2 p0, half2 p1, float p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{first two arguments to '__builtin_spirv_refract' must have the same type}} +} + +float test_float_and_half_2_inputs(float2 p0, float2 p1, half p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{all arguments to '__builtin_spirv_refract' must be of scalar or vector type with matching scalar element type: 'float2' (vector of 2 'float' values) vs 'half' (aka '_Float16')}} +} + +float2 test_mismatch_vector_size_inputs(float2 p0, float3 p1, float p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{first two arguments to '__builtin_spirv_refract' must have the same type}} +} diff --git a/clang/test/SemaTemplate/concepts-using-decl.cpp b/clang/test/SemaTemplate/concepts-using-decl.cpp index fca69dea5c88f..41f7b6d2f8faa 100644 --- a/clang/test/SemaTemplate/concepts-using-decl.cpp +++ b/clang/test/SemaTemplate/concepts-using-decl.cpp @@ -176,3 +176,24 @@ void func() { f.foo<10, 10>(); // expected-error {{no matching member function for call to 'foo'}} } } // namespace heads_without_concepts. + +namespace GH146614 { + +template +struct base { + template + void foo(A x) + requires (requires{x;}) + {} +}; + + +struct child : base { + using base::foo; + template + void foo(A x) + requires (false) + {} +}; + +} diff --git a/clang/test/SemaTemplate/gh138371.cpp b/clang/test/SemaTemplate/gh138371.cpp new file mode 100644 index 0000000000000..1b8fe09eb7e97 --- /dev/null +++ b/clang/test/SemaTemplate/gh138371.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// expected-no-diagnostics + +// This would previously trigger a failed assertion when instantiating the +// template which uses an overloaded call operator because the end location +// for the expression came from a macro expansion. + +#define ASSIGN_OR_RETURN(...) 
(__VA_ARGS__) + +struct Loc { + int operator()(const char* _Nonnull f = __builtin_FILE()) const; +}; + +template +void f() { + ASSIGN_OR_RETURN(Loc()()); +} + +void test() { + f(); +} + diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index c0efbb7588ccb..24ad3cb42254d 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -284,7 +284,7 @@ static bool fillRanges(MemoryBuffer *Code, if (Offsets.size() == 1 && EmptyLengths) { Length = Sources.getFileOffset(Sources.getLocForEndOfFile(ID)) - Offsets[0]; } else if (Offsets.size() != Lengths.size()) { - errs() << "error: number of -offset and -length arguments must match\n"; + errs() << "error: number of -offset and -length arguments must match.\n"; return true; } for (unsigned I = 0, E = Offsets.size(), CodeSize = Code->getBufferSize(); @@ -296,16 +296,12 @@ static bool fillRanges(MemoryBuffer *Code, } if (!EmptyLengths) Length = Lengths[I]; - if (Length == 0) { - errs() << "error: length should be at least 1\n"; - return true; - } if (Offset + Length > CodeSize) { errs() << "error: invalid length " << Length << ", offset + length (" - << Offset + Length << ") is outside the file\n"; + << Offset + Length << ") is outside the file.\n"; return true; } - Ranges.push_back(tooling::Range(Offset, Length - 1)); + Ranges.push_back(tooling::Range(Offset, Length)); } return false; } diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 9089984fa4a54..75afa87947be4 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1438,10 +1438,6 @@ bool CursorVisitor::VisitNestedNameSpecifier(NestedNameSpecifier *NNS, return Visit( MakeCursorNamespaceRef(NNS->getAsNamespace(), Range.getBegin(), TU)); - case NestedNameSpecifier::NamespaceAlias: - return Visit(MakeCursorNamespaceRef(NNS->getAsNamespaceAlias(), - Range.getBegin(), TU)); - case NestedNameSpecifier::TypeSpec: { // If the type has a form where we know that the beginning of the source // range matches up with a reference cursor. Visit the appropriate reference @@ -1483,13 +1479,6 @@ bool CursorVisitor::VisitNestedNameSpecifierLoc( break; - case NestedNameSpecifier::NamespaceAlias: - if (Visit(MakeCursorNamespaceRef(NNS->getAsNamespaceAlias(), - Q.getLocalBeginLoc(), TU))) - return true; - - break; - case NestedNameSpecifier::TypeSpec: if (Visit(Q.getTypeLoc())) return true; diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt index b6662b66206b2..2b1e266f07392 100644 --- a/clang/tools/libclang/CMakeLists.txt +++ b/clang/tools/libclang/CMakeLists.txt @@ -42,6 +42,7 @@ set(SOURCES Indexing.cpp FatalErrorHandler.cpp Rewrite.cpp + Obsolete.cpp ADDITIONAL_HEADERS CIndexDiagnostic.h diff --git a/clang/tools/libclang/Obsolete.cpp b/clang/tools/libclang/Obsolete.cpp new file mode 100644 index 0000000000000..3596f76e1be6f --- /dev/null +++ b/clang/tools/libclang/Obsolete.cpp @@ -0,0 +1,48 @@ +//===- Obsolete.cpp - Obsolete libclang functions and types -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--------------------------------------------------------------------===// +// +// This file contains libclang symbols whose underlying functionality has been +// removed from Clang, but which need to be kept around so as to retain ABI +// compatibility. +// +//===--------------------------------------------------------------------===// + +#include "clang-c/CXString.h" +#include "clang-c/Index.h" +#include "clang-c/Platform.h" +#include "llvm/Support/raw_ostream.h" + +extern "C" { + +// The functions below used to be part of the C API for ARCMigrate, which has +// since been removed from Clang; they already used to print an error if Clang +// was compiled without arcmt support, so we continue doing so. +CXRemapping clang_getRemappings(const char *) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return nullptr; +} + +CXRemapping clang_getRemappingsFromFileList(const char **, unsigned) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return nullptr; +} + +unsigned clang_remap_getNumFiles(CXRemapping) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return 0; +} + +void clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; +} + +void clang_remap_dispose(CXRemapping) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; +} + +} // extern "C" diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map index d140a71e771a0..3d9d2e268a611 100644 --- a/clang/tools/libclang/libclang.map +++ b/clang/tools/libclang/libclang.map @@ -327,6 +327,8 @@ LLVM_13 { clang_getRange; clang_getRangeEnd; clang_getRangeStart; + clang_getRemappings; + clang_getRemappingsFromFileList; clang_getResultType; clang_getSkippedRanges; clang_getSpecializedCursorTemplate; @@ -387,6 +389,9 @@ LLVM_13 { clang_parseTranslationUnit; clang_parseTranslationUnit2; clang_parseTranslationUnit2FullArgv; + clang_remap_dispose; + clang_remap_getFilenames; + clang_remap_getNumFiles; clang_reparseTranslationUnit; clang_saveTranslationUnit; clang_sortCodeCompletionResults; @@ -435,12 +440,12 @@ LLVM_20 { clang_getTypePrettyPrinted; clang_isBeforeInTranslationUnit; clang_visitCXXBaseClasses; - clang_visitCXXMethods; }; LLVM_21 { global: clang_getFullyQualifiedName; + clang_visitCXXMethods; clang_Cursor_getGCCAssemblyTemplate; clang_Cursor_isGCCAssemblyHasGoto; clang_Cursor_getGCCAssemblyNumOutputs; diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp index 49abe881eeabb..287122393446d 100644 --- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp +++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp @@ -2925,6 +2925,19 @@ TEST_P(ASTMatchersTest, IsBitField) { fieldDecl(isBitField(), hasName("b")))); EXPECT_TRUE(matches("struct C { int a : 2; int b : 4; };", fieldDecl(isBitField(), hasBitWidth(2), hasName("a")))); + if (GetParam().isCXX()) { + // This test verifies 2 things: + // (1) That templates work correctly. + // (2) That the matcher does not crash on template-dependent bit widths. 
+ EXPECT_TRUE(matches("template " + "struct C { " + "explicit C(bool x) : a(x) { } " + "int a : N; " + "int b : 4; " + "}; " + "template struct C<2>;", + fieldDecl(isBitField(), hasBitWidth(2), hasName("a")))); + } } TEST_P(ASTMatchersTest, HasInClassInitializer) { diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index d17109aebc0f8..65d8b36c677bd 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -259,6 +259,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_NESTED_BOOL(SpacesInParensOptions, Other); CHECK_PARSE_NESTED_BOOL(SortIncludes, Enabled); CHECK_PARSE_NESTED_BOOL(SortIncludes, IgnoreCase); + CHECK_PARSE_NESTED_BOOL(SortIncludes, IgnoreExtension); } #undef CHECK_PARSE_BOOL @@ -980,17 +981,20 @@ TEST(ConfigParseTest, ParsesConfiguration) { IncludeStyle.IncludeIsMainSourceRegex, "abc$"); Style.SortIncludes = {}; - CHECK_PARSE("SortIncludes: true", SortIncludes, - FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/false})); + CHECK_PARSE( + "SortIncludes: true", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/false, /*IgnoreExtension=*/false})); CHECK_PARSE("SortIncludes: false", SortIncludes, FormatStyle::SortIncludesOptions({})); - CHECK_PARSE("SortIncludes: CaseInsensitive", SortIncludes, - FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/true})); - CHECK_PARSE("SortIncludes: CaseSensitive", SortIncludes, - FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/false})); + CHECK_PARSE( + "SortIncludes: CaseInsensitive", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/true, /*IgnoreExtension=*/false})); + CHECK_PARSE( + "SortIncludes: CaseSensitive", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/false, /*IgnoreExtension=*/false})); CHECK_PARSE("SortIncludes: Never", SortIncludes, FormatStyle::SortIncludesOptions({})); diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp index 994227efdd4f8..5194d65ed3637 100644 --- a/clang/unittests/Format/SortIncludesTest.cpp +++ b/clang/unittests/Format/SortIncludesTest.cpp @@ -1483,6 +1483,26 @@ TEST_F(SortIncludesTest, BlockCommentedOutIncludes) { verifyFormat(Code, sort(Code, "input.cpp", 0)); } +TEST_F(SortIncludesTest, IgnoreExtension) { + FmtStyle.SortIncludes.IgnoreExtension = true; + + verifyFormat("#include \n" + "#include \n" + "#include ", + sort("#include \n" + "#include \n" + "#include ", + "input.h")); + + verifyFormat("#include \n" + "#include \n" + "#include ", + sort("#include \n" + "#include \n" + "#include ", + "input.h")); +} + } // end namespace } // end namespace format } // end namespace clang diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index e281a4945a862..ce7787ede0f5c 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -390,6 +390,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_CompoundRequirementLBrace); EXPECT_TOKEN(Tokens[22], tok::star, TT_BinaryOperator); + Tokens = annotate("bool foo = requires { static_cast(1); };"); + ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_PointerOrReference); + Tokens = annotate("return s.operator int *();"); ASSERT_EQ(Tokens.size(), 
10u) << Tokens; // Not TT_FunctionDeclarationName. diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp index ddc663e2b6fd3..23a2df42ff08c 100644 --- a/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp +++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp @@ -25,8 +25,8 @@ class NestedNameSpecifiersVisitor : public ExpectedLocationVisitor { bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) override { if (!NNS) return true; - if (const NamespaceDecl *ND = - NNS.getNestedNameSpecifier()->getAsNamespace()) + if (const auto *ND = dyn_cast_if_present<NamespaceDecl>( + NNS.getNestedNameSpecifier()->getAsNamespace())) Match(ND->getName(), NNS.getLocalBeginLoc()); return ExpectedLocationVisitor::TraverseNestedNameSpecifierLoc(NNS); } diff --git a/clang/unittests/Tooling/RefactoringTest.cpp b/clang/unittests/Tooling/RefactoringTest.cpp index 254d95bc20cb0..35d114343b517 100644 --- a/clang/unittests/Tooling/RefactoringTest.cpp +++ b/clang/unittests/Tooling/RefactoringTest.cpp @@ -748,7 +748,8 @@ class NestedNameSpecifierAVisitor : public TestVisitor { public: bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNSLoc) override { if (NNSLoc.getNestedNameSpecifier()) { - if (const NamespaceDecl* NS = NNSLoc.getNestedNameSpecifier()->getAsNamespace()) { + if (const auto *NS = dyn_cast_if_present<NamespaceDecl>( + NNSLoc.getNestedNameSpecifier()->getAsNamespace())) { if (NS->getName() == "a") { Replace = Replacement(*SM, &NNSLoc, "", Context->getLangOpts()); } diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake index f14aae172f077..f59dd7b3722dd 100644 --- a/cmake/Modules/LLVMVersion.cmake +++ b/cmake/Modules/LLVMVersion.cmake @@ -1,7 +1,7 @@ # The LLVM Version number information if(NOT DEFINED LLVM_VERSION_MAJOR) - set(LLVM_VERSION_MAJOR 21) + set(LLVM_VERSION_MAJOR 22) endif() if(NOT DEFINED LLVM_VERSION_MINOR) set(LLVM_VERSION_MINOR 0) endif() diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index 0c30959b23e28..18c2a6c571c1f 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -449,10 +449,12 @@ AddressDescription::AddressDescription(uptr addr, uptr access_size, // are put to the STACK region for unknown reasons. Checking globals first can // work around this issue. // TODO: Look into whether there's a different solution to this problem.
+#if SANITIZER_AIX if (GetGlobalAddressInformation(addr, access_size, &data.global)) { data.kind = kAddressKindGlobal; return; } +#endif if (GetHeapAddressInformation(addr, access_size, &data.heap)) { data.kind = kAddressKindHeap; @@ -471,6 +473,14 @@ AddressDescription::AddressDescription(uptr addr, uptr access_size, return; } +// GetGlobalAddressInformation is called earlier on AIX due to a workaround +#if !SANITIZER_AIX + if (GetGlobalAddressInformation(addr, access_size, &data.global)) { + data.kind = kAddressKindGlobal; + return; + } +#endif + data.kind = kAddressKindWild; data.wild.addr = addr; data.wild.access_size = access_size; diff --git a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp index e6ddbb00b843c..71f810e9d9724 100644 --- a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp @@ -11,44 +11,60 @@ #include "common.h" #include "mem_map.h" + +#include +#include +#include + #include -#include +#include namespace scudo { -static uptr getResidentMemorySize() { - if (!SCUDO_LINUX) - UNREACHABLE("Not implemented!"); - uptr Size; - uptr Resident; - std::ifstream IFS("/proc/self/statm"); - IFS >> Size; - IFS >> Resident; - return Resident * getPageSizeCached(); +static void getResidentPages(void *BaseAddress, size_t TotalPages, + size_t *ResidentPages) { + std::vector<unsigned char> Pages(TotalPages, 0); + ASSERT_EQ( + 0, mincore(BaseAddress, TotalPages * getPageSizeCached(), Pages.data())) + << strerror(errno); + *ResidentPages = 0; + for (unsigned char Value : Pages) { + if (Value & 1) { + ++*ResidentPages; + } + } } -// Fuchsia needs getResidentMemorySize implementation. +// Fuchsia needs getResidentPages implementation. TEST(ScudoCommonTest, SKIP_ON_FUCHSIA(ResidentMemorySize)) { - uptr OnStart = getResidentMemorySize(); - EXPECT_GT(OnStart, 0UL); - - const uptr Size = 1ull << 30; - const uptr Threshold = Size >> 3; + // Make sure to have the size of the map on a page boundary. + const uptr PageSize = getPageSizeCached(); + const size_t NumPages = 1000; + const uptr SizeBytes = NumPages * PageSize; MemMapT MemMap; - ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "ResidentMemorySize")); + ASSERT_TRUE(MemMap.map(/*Addr=*/0U, SizeBytes, "ResidentMemorySize")); ASSERT_NE(MemMap.getBase(), 0U); - void *P = reinterpret_cast<void *>(MemMap.getBase()); - EXPECT_LT(getResidentMemorySize(), OnStart + Threshold); - - memset(P, 1, Size); - EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold); - - MemMap.releasePagesToOS(MemMap.getBase(), Size); - EXPECT_LT(getResidentMemorySize(), OnStart + Threshold); - memset(P, 1, Size); - EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold); + void *P = reinterpret_cast<void *>(MemMap.getBase()); + size_t ResidentPages; + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(0U, ResidentPages); + + // Make the entire map resident. + memset(P, 1, SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(NumPages, ResidentPages); + + // Should release the memory to the kernel immediately. + MemMap.releasePagesToOS(MemMap.getBase(), SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(0U, ResidentPages); + + // Make the entire map resident again.
+ memset(P, 1, SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(NumPages, ResidentPages); MemMap.unmap(); } diff --git a/compiler-rt/lib/tysan/lit.cfg b/compiler-rt/lib/tysan/lit.cfg index e3ef6c9c97147..c906c03cc3fb2 100644 --- a/compiler-rt/lib/tysan/lit.cfg +++ b/compiler-rt/lib/tysan/lit.cfg @@ -27,7 +27,7 @@ config.substitutions.append( ("%clangxx_tysan ", build_invocation(clang_tysan_cx config.suffixes = ['.c', '.cc', '.cpp'] # TypeSanitizer tests are currently supported on Linux only. -if config.host_os not in ['Linux']: +if config.target_os not in ['Linux']: config.unsupported = True if config.target_arch != 'aarch64': diff --git a/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py index 57c0979e60962..b622e072bcbfb 100644 --- a/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Windows"]: +if root.target_os not in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index 3da073332c458..96201e679b0a3 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -28,7 +28,7 @@ def get_required_attr(config, attr_name): # tests to prevent regressions. # Currently, detect_leaks for asan tests only work on Intel MacOS. if ( - config.host_os == "Darwin" + config.target_os == "Darwin" and config.apple_platform == "osx" and config.target_arch == "x86_64" ): @@ -45,7 +45,7 @@ def get_required_attr(config, attr_name): # Setup source root. 
config.test_source_root = os.path.dirname(__file__) -if config.host_os not in ["FreeBSD", "NetBSD"]: +if config.target_os not in ["FreeBSD", "NetBSD"]: libdl_flag = "-ldl" else: libdl_flag = "" @@ -125,17 +125,17 @@ def build_invocation(compile_flags, with_lto=False): ("%clangxx_asan_lto ", build_invocation(clang_asan_cxxflags, True)) ) if config.asan_dynamic: - if config.host_os in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: + if config.target_os in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: shared_libasan_path = os.path.join( config.compiler_rt_libdir, "libclang_rt.asan{}.so".format(config.target_suffix), ) - elif config.host_os == "Darwin": + elif config.target_os == "Darwin": shared_libasan_path = os.path.join( config.compiler_rt_libdir, "libclang_rt.asan_{}_dynamic.dylib".format(config.apple_platform), ) - elif config.host_os == "Windows": + elif config.target_os == "Windows": shared_libasan_path = os.path.join( config.compiler_rt_libdir, "clang_rt.asan_dynamic-{}.lib".format(config.target_suffix), @@ -274,16 +274,16 @@ def build_invocation(compile_flags, with_lto=False): and (config.target_arch in ["x86_64", "i386", "i686", "aarch64"]) ) leak_detection_linux = ( - (config.host_os == "Linux") + (config.target_os == "Linux") and (not config.android) and (config.target_arch in ["x86_64", "i386", "riscv64", "loongarch64"]) ) leak_detection_mac = ( - (config.host_os == "Darwin") + (config.target_os == "Darwin") and (config.apple_platform == "osx") and (config.target_arch == "x86_64") ) -leak_detection_netbsd = (config.host_os == "NetBSD") and ( +leak_detection_netbsd = (config.target_os == "NetBSD") and ( config.target_arch in ["x86_64", "i386"] ) if ( @@ -296,7 +296,7 @@ def build_invocation(compile_flags, with_lto=False): # Add the RT libdir to PATH directly so that we can successfully run the gtest # binary to list its tests. -if config.host_os == "Windows": +if config.target_os == "Windows": os.environ["PATH"] = os.path.pathsep.join( [config.compiler_rt_libdir, os.environ.get("PATH", "")] ) @@ -310,10 +310,10 @@ def build_invocation(compile_flags, with_lto=False): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.suffixes.append(".mm") -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append(("%fPIC", "")) config.substitutions.append(("%fPIE", "")) config.substitutions.append(("%pie", "")) @@ -323,11 +323,11 @@ def build_invocation(compile_flags, with_lto=False): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. 
-if config.host_os not in ["Linux", "Darwin", "FreeBSD", "SunOS", "Windows", "NetBSD"]: +if config.target_os not in ["Linux", "Darwin", "FreeBSD", "SunOS", "Windows", "NetBSD"]: config.unsupported = True if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/asan_abi/lit.cfg.py b/compiler-rt/test/asan_abi/lit.cfg.py index 5bc1881ed9c32..dd99a5373e7b6 100644 --- a/compiler-rt/test/asan_abi/lit.cfg.py +++ b/compiler-rt/test/asan_abi/lit.cfg.py @@ -68,7 +68,7 @@ def build_invocation(compile_flags): config.suffixes = ['.c', '.cpp'] -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': config.suffixes.append('.mm') else: config.unsupported = True diff --git a/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/builtins/Unit/lit.cfg.py b/compiler-rt/test/builtins/Unit/lit.cfg.py index c030f89c66e42..59da054848f3c 100644 --- a/compiler-rt/test/builtins/Unit/lit.cfg.py +++ b/compiler-rt/test/builtins/Unit/lit.cfg.py @@ -80,10 +80,10 @@ def get_libgcc_file_name(): config.compiler_rt_libdir, "clang_rt.builtins%s.lib " % config.target_suffix ) config.substitutions.append(("%librt ", base_lib)) -elif config.host_os == "Darwin": +elif config.target_os == "Darwin": base_lib = os.path.join(config.compiler_rt_libdir, "libclang_rt.osx.a ") config.substitutions.append(("%librt ", base_lib + " -lSystem ")) -elif config.host_os == "Windows": +elif config.target_os == "Windows": base_lib = os.path.join( config.compiler_rt_libdir, "libclang_rt.builtins%s.a" % config.target_suffix ) @@ -104,7 +104,7 @@ def get_libgcc_file_name(): if sys.platform in ["win32"] and execute_external: # Don't pass dosish path separator to msys bash.exe. base_lib = base_lib.replace("\\", "/") - if config.host_os == "Haiku": + if config.target_os == "Haiku": config.substitutions.append(("%librt ", base_lib + " -lroot ")) else: config.substitutions.append(("%librt ", base_lib + " -lc -lm ")) diff --git a/compiler-rt/test/builtins/lit.cfg.py b/compiler-rt/test/builtins/lit.cfg.py index 9300488c8428d..6491f4735b9e6 100644 --- a/compiler-rt/test/builtins/lit.cfg.py +++ b/compiler-rt/test/builtins/lit.cfg.py @@ -21,7 +21,7 @@ ("%clang ", " " + config.clang + " " + " ".join(extra_flags) + " ") ) -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.substitutions.append( ("%macos_version_major", str(config.darwin_osx_version[0])) ) diff --git a/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py b/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py index dceb7cde7218b..2778d8c995fd1 100644 --- a/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py +++ b/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py @@ -6,7 +6,7 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux", "FreeBSD", "NetBSD"]: +if root.target_os not in ["Linux", "FreeBSD", "NetBSD"]: config.unsupported = True # Android O (API level 26) has support for cross-dso cfi in libdl.so. 
diff --git a/compiler-rt/test/ctx_profile/lit.cfg.py b/compiler-rt/test/ctx_profile/lit.cfg.py index 74d9bfd11ae28..75367d95a47bd 100644 --- a/compiler-rt/test/ctx_profile/lit.cfg.py +++ b/compiler-rt/test/ctx_profile/lit.cfg.py @@ -7,7 +7,7 @@ import lit.formats # Only run the tests on supported OSs. -if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/dfsan/lit.cfg.py b/compiler-rt/test/dfsan/lit.cfg.py index e947c51f99a5b..b26ff3e367942 100644 --- a/compiler-rt/test/dfsan/lit.cfg.py +++ b/compiler-rt/test/dfsan/lit.cfg.py @@ -25,5 +25,5 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] # DataFlowSanitizer tests are currently supported on Linux only. -if not (config.host_os in ["Linux"] and config.target_arch in ["aarch64", "x86_64", "loongarch64"]): +if not (config.target_os in ["Linux"] and config.target_arch in ["aarch64", "x86_64", "loongarch64"]): config.unsupported = True diff --git a/compiler-rt/test/fuzzer/lit.cfg.py b/compiler-rt/test/fuzzer/lit.cfg.py index 75d4cf2e4c529..1689f53d0b021 100644 --- a/compiler-rt/test/fuzzer/lit.cfg.py +++ b/compiler-rt/test/fuzzer/lit.cfg.py @@ -149,5 +149,5 @@ def generate_compiler_cmd(is_cpp=True, fuzzer_enabled=True, msan_enabled=False): if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/gwp_asan/lit.cfg.py b/compiler-rt/test/gwp_asan/lit.cfg.py index 7f68682162e3f..1592cf400023e 100644 --- a/compiler-rt/test/gwp_asan/lit.cfg.py +++ b/compiler-rt/test/gwp_asan/lit.cfg.py @@ -67,5 +67,5 @@ def build_invocation(compile_flags): ) # GWP-ASan tests are currently supported on Linux only. -if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/hwasan/lit.cfg.py b/compiler-rt/test/hwasan/lit.cfg.py index bbf23e683240a..3a1c8e1466aea 100644 --- a/compiler-rt/test/hwasan/lit.cfg.py +++ b/compiler-rt/test/hwasan/lit.cfg.py @@ -86,5 +86,5 @@ def build_invocation(compile_flags): # Default test suffixes. 
config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "Android"] or not config.has_lld: +if config.target_os not in ["Linux", "Android"] or not config.has_lld: config.unsupported = True diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 877718c703ba7..8328b407dcc36 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -66,7 +66,7 @@ def find_compiler_libdir(): # Fall back for older AppleClang that doesn't support `-print-runtime-dir` # Note `-print-file-name=` was broken for Apple # platforms so we can't use that approach here (see https://reviews.llvm.org/D101682). - if config.host_os == "Darwin": + if config.target_os == "Darwin": lib_dir, _ = get_path_from_clang(["-print-file-name=lib"], allow_failure=False) runtime_dir = os.path.join(lib_dir, "darwin") if not os.path.exists(runtime_dir): @@ -312,7 +312,7 @@ def push_dynamic_library_lookup_path(config, new_path): if platform.system() == "Windows" and target_is_msvc: config.environment["LIB"] = os.environ["LIB"] -config.available_features.add(config.host_os.lower()) +config.available_features.add(config.target_os.lower()) if config.target_triple.startswith("ppc") or config.target_triple.startswith("powerpc"): config.available_features.add("ppc") @@ -344,7 +344,7 @@ def push_dynamic_library_lookup_path(config, new_path): ) ) -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": nb_commands_dir = os.path.join( config.compiler_rt_src_root, "test", "sanitizer_common", "netbsd_commands" ) @@ -395,7 +395,7 @@ def get_ios_commands_dir(): if sanitizer not in config.environment: config.environment[sanitizer] = symbolizer_path -env_utility = "/opt/freeware/bin/env" if config.host_os == "AIX" else "env" +env_utility = "/opt/freeware/bin/env" if config.target_os == "AIX" else "env" env_unset_command = " ".join(f"-u {var}" for var in tool_symbolizer_path_list) config.substitutions.append( ("%env_unset_tool_symbolizer_path", f"{env_utility} {env_unset_command}") ) @@ -410,7 +410,7 @@ def get_ios_commands_dir(): lit_config.warning("%device_rm is not implemented") config.substitutions.append(("%device_rm", "echo ")) config.compile_wrapper = "" -elif config.host_os == "Darwin" and config.apple_platform != "osx": +elif config.target_os == "Darwin" and config.apple_platform != "osx": # Darwin tests can be targeting macOS, a device or a simulator. All devices # are declared as "ios", even for iOS derivatives (tvOS, watchOS). Similarly, # all simulators are "iossim". See the table below. @@ -498,12 +498,12 @@ def get_ios_commands_dir(): config.compile_wrapper = "" # Define CHECK-%os to check for OS-dependent output. -config.substitutions.append(("CHECK-%os", ("CHECK-" + config.host_os))) +config.substitutions.append(("CHECK-%os", ("CHECK-" + config.target_os))) # Define %arch to check for architecture-dependent output. config.substitutions.append(("%arch", (config.host_arch))) -if config.host_os == "Windows": +if os.name == "nt": # FIXME: This isn't quite right. Specifically, it will succeed if the program # does not crash but exits with a non-zero exit code.
We ought to merge # KillTheDoctor and not --crash to make the latter more useful and remove the @@ -519,7 +519,7 @@ def get_ios_commands_dir(): config.available_features.add(target_arch + "-target-arch") if target_arch in ["x86_64", "i386"]: config.available_features.add("x86-target-arch") - config.available_features.add(target_arch + "-" + config.host_os.lower()) + config.available_features.add(target_arch + "-" + config.target_os.lower()) compiler_rt_debug = getattr(config, "compiler_rt_debug", False) if not compiler_rt_debug: @@ -565,7 +565,7 @@ def get_ios_commands_dir(): ("%darwin_min_target_with_tls_support", "%min_macos_deployment_target=10.12") ) -if config.host_os == "Darwin": +if config.target_os == "Darwin": osx_version = (10, 0, 0) try: osx_version = subprocess.check_output( @@ -602,12 +602,12 @@ def get_ios_commands_dir(): def get_macos_aligned_version(macos_vers): platform = config.apple_platform - if platform == "osx": + macos_major, macos_minor = macos_vers + + if platform == "osx" or macos_major >= 26: return macos_vers - macos_major, macos_minor = macos_vers assert macos_major >= 10 - if macos_major == 10: # macOS 10.x major = macos_minor minor = 0 @@ -708,7 +708,7 @@ def get_macos_aligned_version(macos_vers): config.substitutions.append(("%push_to_device", "echo ")) config.substitutions.append(("%adb_shell", "echo ")) -if config.host_os == "Linux": +if config.target_os == "Linux": def add_glibc_versions(ver_string): if config.android: return @@ -806,10 +806,10 @@ def is_windows_lto_supported(): return os.path.exists(os.path.join(config.llvm_tools_dir, "lld-link.exe")) -if config.host_os == "Darwin" and is_darwin_lto_supported(): +if config.target_os == "Darwin" and is_darwin_lto_supported(): config.lto_supported = True config.lto_flags = ["-Wl,-lto_library," + liblto_path()] -elif config.host_os in ["Linux", "FreeBSD", "NetBSD"]: +elif config.target_os in ["Linux", "FreeBSD", "NetBSD"]: config.lto_supported = False if config.use_lld and is_lld_lto_supported(): config.lto_supported = True @@ -822,7 +822,7 @@ def is_windows_lto_supported(): config.lto_flags = ["-fuse-ld=lld"] else: config.lto_flags = ["-fuse-ld=gold"] -elif config.host_os == "Windows" and is_windows_lto_supported(): +elif config.target_os == "Windows" and is_windows_lto_supported(): config.lto_supported = True config.lto_flags = ["-fuse-ld=lld"] else: @@ -871,7 +871,7 @@ def is_windows_lto_supported(): # Note that substitutions with numbers have to be defined first to avoid # being subsumed by substitutions with smaller postfix. 
for postfix in ["2", "1", ""]: - if config.host_os == "Darwin": + if config.target_os == "Darwin": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -884,7 +884,7 @@ def is_windows_lto_supported(): "-install_name @rpath/`basename %dynamiclib{}`".format(postfix), ) ) - elif config.host_os in ("FreeBSD", "NetBSD", "OpenBSD"): + elif config.target_os in ("FreeBSD", "NetBSD", "OpenBSD"): config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -893,7 +893,7 @@ def is_windows_lto_supported(): ) ) config.substitutions.append(("%ld_flags_rpath_so" + postfix, "")) - elif config.host_os == "Linux": + elif config.target_os == "Linux": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -901,7 +901,7 @@ def is_windows_lto_supported(): ) ) config.substitutions.append(("%ld_flags_rpath_so" + postfix, "")) - elif config.host_os == "SunOS": + elif config.target_os == "SunOS": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -923,7 +923,7 @@ def is_windows_lto_supported(): config.substitutions.append(("%xdynamiclib_namespec", "%basename_t.dynamic")) config.default_sanitizer_opts = [] -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. config.default_sanitizer_opts += ["abort_on_error=0"] @@ -983,7 +983,7 @@ def is_windows_lto_supported(): elif config.use_lld and (not config.has_lld): config.unsupported = True -if config.host_os == "Darwin": +if config.target_os == "Darwin": if getattr(config, "darwin_linker_version", None): extra_cflags += ["-mlinker-version=" + config.darwin_linker_version] @@ -998,7 +998,7 @@ def is_windows_lto_supported(): ) config.target_cflags = " " + " ".join(target_cflags + extra_cflags) + " " -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.substitutions.append( ( "%get_pid_from_output", diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 04d1a4df5a54f..2b4b72bc895c5 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -10,7 +10,7 @@ set_default("target_triple", "@COMPILER_RT_DEFAULT_TARGET_TRIPLE@") set_default("target_cflags", "@COMPILER_RT_TEST_COMPILER_CFLAGS@") set_default("host_arch", "@HOST_ARCH@") set_default("target_arch", "@COMPILER_RT_DEFAULT_TARGET_ARCH@") -set_default("host_os", "@HOST_OS@") +set_default("target_os", "@HOST_OS@") set_default("llvm_build_mode", "@LLVM_BUILD_MODE@") set_default("llvm_src_root", "@LLVM_MAIN_SRC_DIR@") set_default("llvm_obj_root", "@LLVM_BINARY_DIR@") diff --git a/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if 
root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/lit.common.cfg.py b/compiler-rt/test/lsan/lit.common.cfg.py index 9426b7d108bbf..1e2679438b114 100644 --- a/compiler-rt/test/lsan/lit.common.cfg.py +++ b/compiler-rt/test/lsan/lit.common.cfg.py @@ -34,7 +34,7 @@ def get_required_attr(config, attr_name): config.name = "LeakSanitizer-AddressSanitizer" lsan_cflags = ["-fsanitize=address"] config.available_features.add("asan") - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) elif lsan_lit_test_mode == "HWAddressSanitizer": config.name = "LeakSanitizer-HWAddressSanitizer" @@ -42,7 +42,7 @@ def get_required_attr(config, attr_name): if target_arch == "x86_64": lsan_cflags = lsan_cflags + ["-fsanitize-hwaddress-experimental-aliasing"] config.available_features.add("hwasan") - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) else: lit_config.fatal("Unknown LSan test mode: %r" % lsan_lit_test_mode) @@ -51,7 +51,7 @@ def get_required_attr(config, attr_name): # Platform-specific default LSAN_OPTIONS for lit tests. default_common_opts_str = ":".join(list(config.default_sanitizer_opts)) default_lsan_opts = default_common_opts_str + ":detect_leaks=1" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. # Also, make sure we do not overwhelm the syslog while testing. @@ -101,7 +101,7 @@ def build_invocation(compile_flags): ) supported_linux = ( (not config.android) - and config.host_os == "Linux" + and config.target_os == "Linux" and config.host_arch in [ "aarch64", @@ -117,8 +117,8 @@ def build_invocation(compile_flags): "loongarch64", ] ) -supported_darwin = config.host_os == "Darwin" and config.target_arch in ["x86_64"] -supported_netbsd = config.host_os == "NetBSD" and config.target_arch in [ +supported_darwin = config.target_os == "Darwin" and config.target_arch in ["x86_64"] +supported_netbsd = config.target_os == "NetBSD" and config.target_arch in [ "x86_64", "i386", ] diff --git a/compiler-rt/test/memprof/lit.cfg.py b/compiler-rt/test/memprof/lit.cfg.py index 4057da0c65b51..e28507be4dc9e 100644 --- a/compiler-rt/test/memprof/lit.cfg.py +++ b/compiler-rt/test/memprof/lit.cfg.py @@ -106,7 +106,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True if not config.parallelism_group: diff --git a/compiler-rt/test/metadata/lit.cfg.py b/compiler-rt/test/metadata/lit.cfg.py index 73ba27ad3a4e2..9980e93b3a6ec 100644 --- a/compiler-rt/test/metadata/lit.cfg.py +++ b/compiler-rt/test/metadata/lit.cfg.py @@ -5,5 +5,5 @@ config.suffixes = [".cpp"] # Binary metadata is currently emitted only for ELF binaries # and sizes of stack arguments depend on the arch. -if config.host_os not in ["Linux"] or config.target_arch not in ["x86_64"]: +if config.target_os not in ["Linux"] or config.target_arch not in ["x86_64"]: config.unsupported = True diff --git a/compiler-rt/test/msan/Linux/lit.local.cfg.py b/compiler-rt/test/msan/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/msan/Linux/lit.local.cfg.py +++ b/compiler-rt/test/msan/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/msan/lit.cfg.py b/compiler-rt/test/msan/lit.cfg.py index 361be79e2557e..d9e83c67b84c8 100644 --- a/compiler-rt/test/msan/lit.cfg.py +++ b/compiler-rt/test/msan/lit.cfg.py @@ -20,7 +20,7 @@ + config.debug_info_flags ) # Some Msan tests leverage backtrace() which requires libexecinfo on FreeBSD. -if config.host_os == "FreeBSD": +if config.target_os == "FreeBSD": clang_msan_cflags += ["-lexecinfo", "-fPIC"] # On SystemZ we need -mbackchain to make the fast unwinder work. if config.target_arch == "s390x": @@ -44,7 +44,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "NetBSD", "FreeBSD"]: +if config.target_os not in ["Linux", "NetBSD", "FreeBSD"]: config.unsupported = True # For mips64, mips64el we have forced store_context_size to 1 because these @@ -55,5 +55,5 @@ def build_invocation(compile_flags): else: config.substitutions.append(("CHECK-%short-stack", "CHECK-FULL-STACK")) -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/nsan/lit.cfg.py b/compiler-rt/test/nsan/lit.cfg.py index 2d67911a7d5d8..8225c85c41b81 100644 --- a/compiler-rt/test/nsan/lit.cfg.py +++ b/compiler-rt/test/nsan/lit.cfg.py @@ -32,5 +32,5 @@ def build_invocation(compile_flags): ) # NSan tests are currently supported on Linux only. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py index b455a936e7cc1..2e3d36c446714 100644 --- a/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Darwin": +if config.root.target_os != "Darwin": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py index e9b1b38ccacd1..0efdb55dc77f4 100644 --- a/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "FreeBSD": +if config.root.target_os != "FreeBSD": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py index 7d85fa3fce392..32e5cfdb141ae 100644 --- a/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Linux": +if config.root.target_os != "Linux": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py index 6d4e7da813641..99d4464cf9e77 100644 --- a/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Windows": +if config.root.target_os != "Windows": config.unsupported = True diff --git a/compiler-rt/test/orc/lit.cfg.py b/compiler-rt/test/orc/lit.cfg.py index 7a6eb4e7de325..3c3badb642ff7 100644 --- a/compiler-rt/test/orc/lit.cfg.py +++ b/compiler-rt/test/orc/lit.cfg.py @@ -18,11 +18,11 @@ config.available_features.add("host-arch-compatible") # If the target OS hasn't been set then assume host. -if not config.target_os: - config.target_os = config.host_os +if not config.orc_test_target_os: + config.orc_test_target_os = config.target_os config.test_target_is_host_executable = ( - config.target_os == config.host_os and host_arch_compatible + config.orc_test_target_os == config.target_os and host_arch_compatible ) # Assume that llvm-jitlink is in the config.llvm_tools_dir. @@ -31,7 +31,7 @@ config.compiler_rt_obj_root, "lib/orc/tests/tools/orc-rt-executor" ) lli = os.path.join(config.llvm_tools_dir, "lli") -if config.host_os == "Darwin": +if config.target_os == "Darwin": orc_rt_path = "%s/liborc_rt_osx.a" % config.compiler_rt_libdir else: orc_rt_path = "%s/liborc_rt%s.a" % (config.compiler_rt_libdir, config.target_suffix) @@ -53,7 +53,7 @@ def build_invocation(compile_flags): config.substitutions.append( ("%clang_cl ", build_invocation(["--driver-mode=cl"] + [config.target_cflags])) ) -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append( ( "%llvm_jitlink", @@ -86,7 +86,7 @@ def build_invocation(compile_flags): # Exclude Inputs directories. config.excludes = ["Inputs"] -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "Windows"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "Windows"]: config.unsupported = True # Ask llvm-config about assertion mode. 
diff --git a/compiler-rt/test/orc/lit.site.cfg.py.in b/compiler-rt/test/orc/lit.site.cfg.py.in index a33ef3d7d7207..d0625f6ace15c 100644 --- a/compiler-rt/test/orc/lit.site.cfg.py.in +++ b/compiler-rt/test/orc/lit.site.cfg.py.in @@ -5,7 +5,8 @@ config.name_suffix = "@ORC_TEST_CONFIG_SUFFIX@" config.orc_lit_source_dir = "@ORC_LIT_SOURCE_DIR@" config.target_cflags = "@ORC_TEST_TARGET_CFLAGS@" config.target_arch = "@ORC_TEST_TARGET_ARCH@" -config.target_os = "@ORC_TEST_TARGET_OS@" +# FIXME: Remove this variable, the target OS is available in config.target_os. +config.orc_test_target_os = "@ORC_TEST_TARGET_OS@" config.built_with_llvm = ("@COMPILER_RT_STANDALONE_BUILD@" != "TRUE") config.libunwind_shared = "@LIBUNWIND_ENABLE_SHARED@" config.libunwind_install_dir = "@LLVM_BINARY_DIR@/@LIBUNWIND_INSTALL_LIBRARY_DIR@" diff --git a/compiler-rt/test/profile/AIX/lit.local.cfg.py b/compiler-rt/test/profile/AIX/lit.local.cfg.py index 55462708e3b6c..3337c692bd0d7 100644 --- a/compiler-rt/test/profile/AIX/lit.local.cfg.py +++ b/compiler-rt/test/profile/AIX/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["AIX"]: +if root.target_os not in ["AIX"]: config.unsupported = True diff --git a/compiler-rt/test/profile/Darwin/lit.local.cfg.py b/compiler-rt/test/profile/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/profile/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/profile/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/profile/Linux/lit.local.cfg.py b/compiler-rt/test/profile/Linux/lit.local.cfg.py index c1e89581a1ab9..4bce33db9bbf7 100644 --- a/compiler-rt/test/profile/Linux/lit.local.cfg.py +++ b/compiler-rt/test/profile/Linux/lit.local.cfg.py @@ -42,7 +42,7 @@ def is_gold_linker_available(): root = getRoot(config) -if root.host_os not in ["Linux"] or not is_gold_linker_available(): +if root.target_os not in ["Linux"] or not is_gold_linker_available(): config.unsupported = True if config.have_curl: diff --git a/compiler-rt/test/profile/Posix/lit.local.cfg.py b/compiler-rt/test/profile/Posix/lit.local.cfg.py index 17a67689192d0..62ee3cbb466c4 100644 --- a/compiler-rt/test/profile/Posix/lit.local.cfg.py +++ b/compiler-rt/test/profile/Posix/lit.local.cfg.py @@ -6,12 +6,12 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True # AIX usually makes use of an explicit export list when linking a shared # object, since the linker doesn't export anything by default.
-if root.host_os in ["AIX"]: +if root.target_os in ["AIX"]: config.substitutions.append(("%shared_linker_xopts", "-Wl,-bE:shr.exp")) else: config.substitutions.append(("%shared_linker_xopts", "")) diff --git a/compiler-rt/test/profile/Windows/lit.local.cfg.py b/compiler-rt/test/profile/Windows/lit.local.cfg.py index 57c0979e60962..b622e072bcbfb 100644 --- a/compiler-rt/test/profile/Windows/lit.local.cfg.py +++ b/compiler-rt/test/profile/Windows/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Windows"]: +if root.target_os not in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py index c9a716abeccd8..df7f11e2b286b 100644 --- a/compiler-rt/test/profile/lit.cfg.py +++ b/compiler-rt/test/profile/lit.cfg.py @@ -30,7 +30,7 @@ def get_required_attr(config, attr_name): target_is_msvc = bool(re.match(r".*-windows-msvc$", config.target_triple)) -if config.host_os in ["Linux"]: +if config.target_os in ["Linux"]: extra_link_flags = ["-ldl"] elif target_is_msvc: # InstrProf is incompatible with incremental linking. Disable it as a @@ -154,7 +154,7 @@ def exclude_unsupported_files_for_aix(dirname): ) ) -if config.host_os not in [ +if config.target_os not in [ "Windows", "Darwin", "FreeBSD", @@ -167,10 +167,10 @@ def exclude_unsupported_files_for_aix(dirname): config.unsupported = True config.substitutions.append( - ("%shared_lib_flag", "-dynamiclib" if (config.host_os == "Darwin") else "-shared") + ("%shared_lib_flag", "-dynamiclib" if (config.target_os == "Darwin") else "-shared") ) -if config.host_os in ["AIX"]: +if config.target_os in ["AIX"]: config.available_features.add("system-aix") exclude_unsupported_files_for_aix(config.test_source_root) exclude_unsupported_files_for_aix(config.test_source_root + "/Posix") @@ -184,5 +184,5 @@ def exclude_unsupported_files_for_aix(dirname): if config.have_curl: config.available_features.add("curl") -if config.host_os in ("AIX", "Darwin", "Linux"): +if config.target_os in ("AIX", "Darwin", "Linux"): config.available_features.add("continuous-mode") diff --git a/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in b/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in index 59e1e10360b52..41fcb32e5009b 100644 --- a/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in +++ b/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in @@ -15,7 +15,7 @@ config.test_source_root = config.test_exec_root if not config.parallelism_group: config.parallelism_group = 'shadow-memory' -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': # On Darwin, we default to ignore_noninstrumented_modules=1, which also # suppresses some races the tests are supposed to find. See rtsan/lit.cfg.py. if 'RTSAN_OPTIONS' in config.environment: diff --git a/compiler-rt/test/rtsan/lit.cfg.py b/compiler-rt/test/rtsan/lit.cfg.py index 7c75515a7608d..6d880c10ecd45 100644 --- a/compiler-rt/test/rtsan/lit.cfg.py +++ b/compiler-rt/test/rtsan/lit.cfg.py @@ -6,7 +6,7 @@ default_rtsan_opts = "atexit_sleep_ms=0" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_rtsan_opts += ":abort_on_error=0" @@ -36,7 +36,7 @@ def build_invocation(compile_flags): llvm_rtsan = os.path.join(config.llvm_tools_dir, "llvm-rtsan") # Setup substitutions. 
-if config.host_os == "Linux": +if config.target_os == "Linux": libdl_flag = "-ldl" else: libdl_flag = "" @@ -52,7 +52,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD", "OpenBSD"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD", "OpenBSD"]: config.unsupported = True elif "64" not in config.host_arch: if "arm" in config.host_arch: @@ -61,5 +61,5 @@ def build_invocation(compile_flags): else: config.unsupported = True -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_nomprotect_prefix)) diff --git a/compiler-rt/test/safestack/lit.cfg.py b/compiler-rt/test/safestack/lit.cfg.py index 4ab9c1ce70bac..3f5565caa65c6 100644 --- a/compiler-rt/test/safestack/lit.cfg.py +++ b/compiler-rt/test/safestack/lit.cfg.py @@ -33,5 +33,5 @@ ) ) -if config.host_os not in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: +if config.target_os not in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py index 0102001660cf1..d4948f04ef64e 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["FreeBSD"]: +if root.target_os not in ["FreeBSD"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py index 3cd1aa667343c..aa4438d04380a 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["NetBSD"]: +if root.target_os not in ["NetBSD"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os 
in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/lit.common.cfg.py b/compiler-rt/test/sanitizer_common/lit.common.cfg.py index 88d3ea9bc5ad2..5614229d9a126 100644 --- a/compiler-rt/test/sanitizer_common/lit.common.cfg.py +++ b/compiler-rt/test/sanitizer_common/lit.common.cfg.py @@ -40,7 +40,7 @@ config.available_features.add(config.tool_name) if ( - config.host_os == "Linux" + config.target_os == "Linux" and config.tool_name == "lsan" and config.target_arch == "i386" ): @@ -49,7 +49,7 @@ if config.arm_thumb: config.available_features.add("thumb") -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_tool_options += ["abort_on_error=0"] @@ -68,7 +68,7 @@ extra_link_flags = [] -if config.host_os in ["Linux"]: +if config.target_os in ["Linux"]: extra_link_flags += ["-ldl"] clang_cflags = config.debug_info_flags + tool_cflags + [config.target_cflags] @@ -92,13 +92,13 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "Darwin", "NetBSD", "FreeBSD", "SunOS"]: +if config.target_os not in ["Linux", "Darwin", "NetBSD", "FreeBSD", "SunOS"]: config.unsupported = True if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) if os.path.exists("/etc/services"): diff --git a/compiler-rt/test/scudo/lit.cfg.py b/compiler-rt/test/scudo/lit.cfg.py index 5d45bd99804c7..b09c996e9ccc5 100644 --- a/compiler-rt/test/scudo/lit.cfg.py +++ b/compiler-rt/test/scudo/lit.cfg.py @@ -70,5 +70,5 @@ def build_invocation(compile_flags): ) # Hardened Allocator tests are currently supported on Linux only. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/shadowcallstack/lit.cfg.py b/compiler-rt/test/shadowcallstack/lit.cfg.py index 70a6b16174c4b..5b95deb1b0986 100644 --- a/compiler-rt/test/shadowcallstack/lit.cfg.py +++ b/compiler-rt/test/shadowcallstack/lit.cfg.py @@ -32,5 +32,5 @@ ) ) -if config.host_os not in ["Linux"] or config.target_arch not in ["aarch64", "riscv64"]: +if config.target_os not in ["Linux"] or config.target_arch not in ["aarch64", "riscv64"]: config.unsupported = True diff --git a/compiler-rt/test/tsan/Darwin/lit.local.cfg.py b/compiler-rt/test/tsan/Darwin/lit.local.cfg.py index 7bf80ac5e1375..876f0cd638bd2 100644 --- a/compiler-rt/test/tsan/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/tsan/Darwin/lit.local.cfg.py @@ -6,7 +6,7 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True config.environment["TSAN_OPTIONS"] += ":ignore_noninstrumented_modules=1" diff --git a/compiler-rt/test/tsan/Linux/lit.local.cfg.py b/compiler-rt/test/tsan/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/tsan/Linux/lit.local.cfg.py +++ b/compiler-rt/test/tsan/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in b/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in index a9c6261ba48d4..b90af4f2c0d3a 100644 --- a/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in +++ b/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in @@ -15,7 +15,7 @@ config.test_source_root = config.test_exec_root if not config.parallelism_group: config.parallelism_group = 'shadow-memory' -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': # On Darwin, we default to ignore_noninstrumented_modules=1, which also # suppresses some races the tests are supposed to find. See tsan/lit.cfg.py. if 'TSAN_OPTIONS' in config.environment: diff --git a/compiler-rt/test/tsan/libcxx/lit.local.cfg.py b/compiler-rt/test/tsan/libcxx/lit.local.cfg.py index f4820dccb0109..b8d054e2de976 100644 --- a/compiler-rt/test/tsan/libcxx/lit.local.cfg.py +++ b/compiler-rt/test/tsan/libcxx/lit.local.cfg.py @@ -8,5 +8,5 @@ def getRoot(config): # Only run if we have an instrumented libcxx. On Darwin, run always (we have # interceptors to support the system-provided libcxx). -if not root.has_libcxx and root.host_os != "Darwin": +if not root.has_libcxx and root.target_os != "Darwin": config.unsupported = True diff --git a/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py b/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py index a7653f4305952..27edf611a0522 100644 --- a/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py +++ b/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py @@ -14,5 +14,5 @@ def getRoot(config): else: config.unsupported = True -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.environment["TSAN_OPTIONS"] += ":ignore_noninstrumented_modules=1" diff --git a/compiler-rt/test/tsan/lit.cfg.py b/compiler-rt/test/tsan/lit.cfg.py index a93333e2e593d..8803a7bda9aa5 100644 --- a/compiler-rt/test/tsan/lit.cfg.py +++ b/compiler-rt/test/tsan/lit.cfg.py @@ -23,7 +23,7 @@ def get_required_attr(config, attr_name): # Setup environment variables for running ThreadSanitizer. 
default_tsan_opts = "atexit_sleep_ms=0" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_tsan_opts += ":abort_on_error=0" @@ -61,7 +61,7 @@ def get_required_attr(config, attr_name): ) # Add additional flags if we're using instrumented libc++. # Instrumented libcxx currently not supported on Darwin. -if config.has_libcxx and config.host_os != "Darwin": +if config.has_libcxx and config.target_os != "Darwin": # FIXME: Dehardcode this path somehow. libcxx_path = os.path.join( config.compiler_rt_obj_root, @@ -86,7 +86,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%clangxx_tsan ", build_invocation(clang_tsan_cxxflags))) # Define CHECK-%os to check for OS-dependent output. -config.substitutions.append(("CHECK-%os", ("CHECK-" + config.host_os))) +config.substitutions.append(("CHECK-%os", ("CHECK-" + config.target_os))) config.substitutions.append( ( @@ -101,7 +101,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp", ".m", ".mm"] -if config.host_os not in ["FreeBSD", "Linux", "Darwin", "NetBSD"]: +if config.target_os not in ["FreeBSD", "Linux", "Darwin", "NetBSD"]: config.unsupported = True if config.android: @@ -110,5 +110,5 @@ def build_invocation(compile_flags): if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/tysan/lit.cfg.py b/compiler-rt/test/tysan/lit.cfg.py index f38e0211639da..26846017b1957 100644 --- a/compiler-rt/test/tysan/lit.cfg.py +++ b/compiler-rt/test/tysan/lit.cfg.py @@ -71,7 +71,7 @@ def push_dynamic_library_lookup_path(config, new_path): # Setup source root. config.test_source_root = os.path.dirname(__file__) -if config.host_os not in ["FreeBSD", "NetBSD"]: +if config.target_os not in ["FreeBSD", "NetBSD"]: libdl_flag = "-ldl" else: libdl_flag = "" @@ -127,10 +127,10 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.suffixes.append(".mm") -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append(("%fPIC", "")) config.substitutions.append(("%fPIE", "")) config.substitutions.append(("%pie", "")) @@ -140,7 +140,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. 
-if config.host_os not in [ +if config.target_os not in [ "Linux", "Darwin", ]: diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py index e69d15f5b141c..4342649532865 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py @@ -1,4 +1,4 @@ -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD"]: config.unsupported = True # Work around "Cannot represent a difference across sections" if config.target_arch == "powerpc64": diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/ubsan/lit.common.cfg.py b/compiler-rt/test/ubsan/lit.common.cfg.py index 04d6f24de5a9f..25e527903788e 100644 --- a/compiler-rt/test/ubsan/lit.common.cfg.py +++ b/compiler-rt/test/ubsan/lit.common.cfg.py @@ -74,7 +74,7 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp", ".m"] # Check that the host supports UndefinedBehaviorSanitizer tests -if config.host_os not in [ +if config.target_os not in [ "Linux", "Darwin", "FreeBSD", @@ -90,5 +90,5 @@ def build_invocation(compile_flags): if ubsan_lit_test_mode in ["AddressSanitizer", "MemorySanitizer", "ThreadSanitizer"]: if not config.parallelism_group: config.parallelism_group = "shadow-memory" - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/ubsan_minimal/lit.common.cfg.py b/compiler-rt/test/ubsan_minimal/lit.common.cfg.py index 714241a580f9d..bcc0e46fbef91 100644 --- a/compiler-rt/test/ubsan_minimal/lit.common.cfg.py +++ b/compiler-rt/test/ubsan_minimal/lit.common.cfg.py @@ -35,7 +35,7 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] # Check that the host supports UndefinedBehaviorSanitizerMinimal tests -if config.host_os not in [ +if config.target_os not in [ "Linux", "FreeBSD", "NetBSD", diff --git a/compiler-rt/test/xray/lit.cfg.py b/compiler-rt/test/xray/lit.cfg.py index f73ae3acd7715..e56ed85d1d822 100644 --- a/compiler-rt/test/xray/lit.cfg.py +++ b/compiler-rt/test/xray/lit.cfg.py @@ -14,7 +14,7 @@ # If libc++ was used to build XRAY libraries, libc++ is needed. Fix applied # to Linux only since -rpath may not be portable. This can be extended to # other platforms. 
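# (For illustration: assuming config.llvm_shlib_dir is /build/lib, the branch
# below appends '-L/build/lib -lc++ -Wl,-rpath=/build/lib', linking the
# just-built libc++ and baking its location into the test binaries' rpath.)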
-if config.libcxx_used == "1" and config.host_os == "Linux": +if config.libcxx_used == "1" and config.target_os == "Linux": clang_xray_cflags = clang_xray_cflags + ( ["-L%s -lc++ -Wl,-rpath=%s" % (config.llvm_shlib_dir, config.llvm_shlib_dir)] ) @@ -30,7 +30,7 @@ def build_invocation(compile_flags): llvm_xray = os.path.join(config.llvm_tools_dir, "llvm-xray") # Setup substitutions. -if config.host_os == "Linux": +if config.target_os == "Linux": libdl_flag = "-ldl" else: libdl_flag = "" @@ -56,7 +56,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os not in ["FreeBSD", "Linux", "NetBSD", "OpenBSD"]: +if config.target_os not in ["FreeBSD", "Linux", "NetBSD", "OpenBSD"]: config.unsupported = True elif "64" not in config.host_arch: if "arm" in config.host_arch: @@ -65,5 +65,5 @@ def build_invocation(compile_flags): else: config.unsupported = True -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_nomprotect_prefix)) diff --git a/compiler-rt/unittests/lit.common.unit.cfg.py b/compiler-rt/unittests/lit.common.unit.cfg.py index 557a42893ec15..93f417c1d50ae 100644 --- a/compiler-rt/unittests/lit.common.unit.cfg.py +++ b/compiler-rt/unittests/lit.common.unit.cfg.py @@ -42,7 +42,7 @@ def get_lit_conf(name, default=None): if "TEMP" in os.environ: config.environment["TEMP"] = os.environ["TEMP"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": # Only run up to 3 processes that require shadow memory simultaneously on # 64-bit Darwin. Using more scales badly and hogs the system due to # inefficient handling of large mmap'd regions (terabytes) by the kernel. diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in index 3e42e83c9e70a..30ccf452ac71f 100644 --- a/compiler-rt/unittests/lit.common.unit.configured.in +++ b/compiler-rt/unittests/lit.common.unit.configured.in @@ -10,7 +10,7 @@ config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@ config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@") config.host_arch = "@HOST_ARCH@" -config.host_os = "@HOST_OS@" +config.target_os = "@HOST_OS@" config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@" config.gwp_asan = @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@ config.emulator = "@COMPILER_RT_EMULATOR@" diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/TestToolBase.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/TestToolBase.py index acbff53b8ed34..ecfc8ebcb1507 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/TestToolBase.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/TestToolBase.py @@ -58,6 +58,13 @@ def add_tool_arguments(self, parser, defaults): default=None, help="directory to save results (default: none)", ) + parser.add_argument( + "--test-root-dir", + type=str, + metavar="", + default=None, + help="if passed, result names will include relative path from this directory", + ) def handle_options(self, defaults): options = self.context.options @@ -130,10 +137,10 @@ def _get_test_name(self, test_path): """Get the test name from either the test file, or the sub directory path it's stored in. """ - # test names are distinguished by their relative path from the - # specified test path. 
- test_name = os.path.relpath(test_path, self.context.options.test_path) - if self._is_current_directory(test_name): + # Test names are either relative to an explicitly given test root directory, or else we just use the base name. + if self.context.options.test_root_dir is not None: + test_name = os.path.relpath(test_path, self.context.options.test_root_dir) + else: test_name = os.path.basename(test_path) return test_name diff --git a/cross-project-tests/dtlto/ld-dtlto.c b/cross-project-tests/dtlto/ld-dtlto.c index 3ee962346bd4a..7dffe2e015bcb 100644 --- a/cross-project-tests/dtlto/ld-dtlto.c +++ b/cross-project-tests/dtlto/ld-dtlto.c @@ -5,13 +5,11 @@ // RUN: rm -rf %t && mkdir %t && cd %t -// RUN: %clang --target=x86_64-linux-gnu -c -flto=thin %s -o dtlto.o - -// RUN: ld.lld dtlto.o \ -// RUN: --thinlto-distributor=%python \ -// RUN: --thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \ -// RUN: --thinlto-remote-compiler=%clang \ -// RUN: --thinlto-remote-compiler-arg=--save-temps +// RUN: %clang --target=x86_64-linux-gnu %s -flto=thin -fuse-ld=lld \ +// RUN: -fthinlto-distributor=%python \ +// RUN: -Xthinlto-distributor=%llvm_src_root/utils/dtlto/local.py \ +// RUN: -Wl,--thinlto-remote-compiler-arg=--save-temps \ +// RUN: -nostdlib -Werror /// Check that the required output files have been created. // RUN: ls | sort | FileCheck %s @@ -22,18 +20,15 @@ /// Linked ELF. // CHECK: {{^}}a.out{{$}} -/// Produced by the bitcode compilation. -// CHECK-NEXT: {{^}}dtlto.o{{$}} - /// --save-temps output for the backend compilation. -// CHECK-NEXT: {{^}}dtlto.s{{$}} -// CHECK-NEXT: {{^}}dtlto.s.0.preopt.bc{{$}} -// CHECK-NEXT: {{^}}dtlto.s.1.promote.bc{{$}} -// CHECK-NEXT: {{^}}dtlto.s.2.internalize.bc{{$}} -// CHECK-NEXT: {{^}}dtlto.s.3.import.bc{{$}} -// CHECK-NEXT: {{^}}dtlto.s.4.opt.bc{{$}} -// CHECK-NEXT: {{^}}dtlto.s.5.precodegen.bc{{$}} -// CHECK-NEXT: {{^}}dtlto.s.resolution.txt{{$}} +// CHECK-NEXT: {{^}}ld-dtlto-[[TMP:[a-zA-Z0-9_]+]].s{{$}} +// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.0.preopt.bc{{$}} +// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.1.promote.bc{{$}} +// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.2.internalize.bc{{$}} +// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.3.import.bc{{$}} +// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.4.opt.bc{{$}} +// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.5.precodegen.bc{{$}} +// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.resolution.txt{{$}} /// No files are expected after. // CHECK-NOT: {{.}} diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index b35c643ac898c..beeed55f65062 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -92,7 +92,13 @@ def get_required_attr(config, attr_name): # use_clang() and use_lld() respectively, so set them to "", if needed. if not hasattr(config, "clang_src_dir"): config.clang_src_dir = "" -llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) +# Facebook T92898286 +should_test_bolt = get_required_attr(config, "llvm_test_bolt") +if should_test_bolt: + llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects), additional_flags=["--post-link-optimize"]) +else: + llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) +# End Facebook T92898286 if not hasattr(config, "lld_src_dir"): config.lld_src_dir = "" @@ -333,3 +339,9 @@ def get_clang_default_dwarf_version_string(triple): # Allow 'REQUIRES: XXX-registered-target' in tests. 
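# (For example, an 'X86' entry in targets_to_build yields the lit feature
# 'x86-registered-target'.)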
for arch in config.targets_to_build: config.available_features.add(arch.lower() + "-registered-target") + +# Facebook T92898286 +# Ensure the user's PYTHONPATH is included. +if "PYTHONPATH" in os.environ: + config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"] +# End Facebook T92898286 diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in index 39458dfc79afd..2d53cd377f033 100644 --- a/cross-project-tests/lit.site.cfg.py.in +++ b/cross-project-tests/lit.site.cfg.py.in @@ -21,6 +21,10 @@ config.mlir_src_root = "@MLIR_SOURCE_DIR@" config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/flang-rt/include/flang-rt/runtime/non-tbp-dio.h b/flang-rt/include/flang-rt/runtime/non-tbp-dio.h index 99d4113b6c7a8..26849298ec959 100644 --- a/flang-rt/include/flang-rt/runtime/non-tbp-dio.h +++ b/flang-rt/include/flang-rt/runtime/non-tbp-dio.h @@ -34,11 +34,16 @@ namespace Fortran::runtime::io { RT_OFFLOAD_API_GROUP_BEGIN +enum NonTbpDefinedIoFlags { + IsDtvArgPolymorphic = 1 << 0, // first dummy arg is CLASS(T) + DefinedIoInteger8 = 1 << 1, // -fdefault-integer-8 affected UNIT= & IOSTAT= +}; + struct NonTbpDefinedIo { const typeInfo::DerivedType &derivedType; void (*subroutine)(); // null means no non-TBP defined I/O here common::DefinedIo definedIo; - bool isDtvArgPolymorphic; // first dummy arg is CLASS(T) + std::uint8_t flags; }; struct NonTbpDefinedIoTable { diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h index a8d39f4f8a1a3..93bca24a602b4 100644 --- a/flang-rt/include/flang-rt/runtime/type-info.h +++ b/flang-rt/include/flang-rt/runtime/type-info.h @@ -143,9 +143,9 @@ class SpecialBinding { // I/O procedures that are not type-bound. RT_API_ATTRS SpecialBinding(Which which, ProcedurePointer proc, std::uint8_t isArgDescSet, std::uint8_t isTypeBound, - std::uint8_t isArgContiguousSet) + std::uint8_t specialCaseFlag) : which_{which}, isArgDescriptorSet_{isArgDescSet}, - isTypeBound_{isTypeBound}, isArgContiguousSet_{isArgContiguousSet}, + isTypeBound_{isTypeBound}, specialCaseFlag_{specialCaseFlag}, proc_{proc} {} static constexpr RT_API_ATTRS Which RankFinal(int rank) { @@ -153,13 +153,11 @@ class SpecialBinding { } RT_API_ATTRS Which which() const { return which_; } + RT_API_ATTRS bool specialCaseFlag() const { return specialCaseFlag_; } RT_API_ATTRS bool IsArgDescriptor(int zeroBasedArg) const { return (isArgDescriptorSet_ >> zeroBasedArg) & 1; } RT_API_ATTRS bool IsTypeBound() const { return isTypeBound_ != 0; } - RT_API_ATTRS bool IsArgContiguous(int zeroBasedArg) const { - return (isArgContiguousSet_ >> zeroBasedArg) & 1; - } template RT_API_ATTRS PROC GetProc(const Binding *bindings = nullptr) const { if (bindings && isTypeBound_ > 0) { @@ -203,10 +201,10 @@ class SpecialBinding { // When a special binding is type-bound, this is its binding's index (plus 1, // so that 0 signifies that it's not type-bound). std::uint8_t isTypeBound_{0}; - // True when a FINAL subroutine has a dummy argument that is an array that - // is CONTIGUOUS or neither assumed-rank nor assumed-shape. - std::uint8_t isArgContiguousSet_{0}; - + // For a FINAL subroutine, set when it has a dummy argument that is an array + // that is CONTIGUOUS or neither assumed-rank nor assumed-shape. 
+ // For a defined I/O subroutine, set when UNIT= and IOSTAT= are INTEGER(8). + std::uint8_t specialCaseFlag_{0}; ProcedurePointer proc_{nullptr}; }; diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp index aa75d4eff0511..d3cc6c2c99e79 100644 --- a/flang-rt/lib/cuda/descriptor.cpp +++ b/flang-rt/lib/cuda/descriptor.cpp @@ -62,6 +62,15 @@ void RTDEF(CUFDescriptorCheckSection)( } } +void RTDEF(CUFSetAllocatorIndex)( + Descriptor *desc, int index, const char *sourceFile, int sourceLine) { + if (!desc) { + Terminator terminator{sourceFile, sourceLine}; + terminator.Crash("descriptor is null"); + } + desc->SetAllocIdx(index); +} + RT_EXT_API_GROUP_END } } // namespace Fortran::runtime::cuda diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp index bb9a68abef2a7..4ed0baaa3d108 100644 --- a/flang-rt/lib/runtime/derived.cpp +++ b/flang-rt/lib/runtime/derived.cpp @@ -270,7 +270,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, StaticDescriptor statDesc; Descriptor ©{statDesc.descriptor()}; const Descriptor *argDescriptor{&descriptor}; - if (descriptor.rank() > 0 && special->IsArgContiguous(0) && + if (descriptor.rank() > 0 && special->specialCaseFlag() && !descriptor.IsContiguous()) { // The FINAL subroutine demands a contiguous array argument, but // this INTENT(OUT) or intrinsic assignment LHS isn't contiguous. diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp index b208cb2c397b3..3868c8ddce19f 100644 --- a/flang-rt/lib/runtime/descriptor-io.cpp +++ b/flang-rt/lib/runtime/descriptor-io.cpp @@ -67,13 +67,29 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( ioType, io.mutableModes().inNamelist ? "NAMELIST" : "LISTDIRECTED"); ioTypeLen = runtime::strlen(ioType); } + // V_LIST= argument StaticDescriptor<1, true> vListStatDesc; Descriptor &vListDesc{vListStatDesc.descriptor()}; - vListDesc.Establish(TypeCategory::Integer, sizeof(int), nullptr, 1); - vListDesc.set_base_addr(edit.vList); - vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); - vListDesc.GetDimension(0).SetByteStride( - static_cast(sizeof(int))); + bool integer8{special.specialCaseFlag()}; + std::int64_t vList64[edit.maxVListEntries]; + if (integer8) { + // Convert v_list values to INTEGER(8) + for (int j{0}; j < edit.vListEntries; ++j) { + vList64[j] = edit.vList[j]; + } + vListDesc.Establish( + TypeCategory::Integer, sizeof(std::int64_t), nullptr, 1); + vListDesc.set_base_addr(vList64); + vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); + vListDesc.GetDimension(0).SetByteStride( + static_cast(sizeof(std::int64_t))); + } else { + vListDesc.Establish(TypeCategory::Integer, sizeof(int), nullptr, 1); + vListDesc.set_base_addr(edit.vList); + vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); + vListDesc.GetDimension(0).SetByteStride( + static_cast(sizeof(int))); + } ExternalFileUnit *actualExternal{io.GetExternalFileUnit()}; ExternalFileUnit *external{actualExternal}; if (!external) { @@ -84,8 +100,8 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( ChildIo &child{external->PushChildIo(io)}; // Child formatted I/O is nonadvancing by definition (F'2018 12.6.2.4). 
auto restorer{common::ScopedSet(io.mutableModes().nonAdvancing, true)}; - int unit{external->unitNumber()}; - int ioStat{IostatOk}; + std::int32_t unit{external->unitNumber()}; + std::int32_t ioStat{IostatOk}; char ioMsg[100]; Fortran::common::optional startPos; if (edit.descriptor == DataEdit::DefinedDerivedType && @@ -98,23 +114,45 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( derived.binding().OffsetElement()}; if (special.IsArgDescriptor(0)) { // "dtv" argument is "class(t)", pass a descriptor - auto *p{special.GetProc( - bindings)}; StaticDescriptor<1, true, 10 /*?*/> elementStatDesc; Descriptor &elementDesc{elementStatDesc.descriptor()}; elementDesc.Establish( derived, nullptr, 0, nullptr, CFI_attribute_pointer); elementDesc.set_base_addr(descriptor.Element(subscripts)); - p(elementDesc, unit, ioType, vListDesc, ioStat, ioMsg, ioTypeLen, - sizeof ioMsg); + if (integer8) { // 64-bit UNIT=/IOSTAT= + std::int64_t unit64{unit}; + std::int64_t ioStat64{ioStat}; + auto *p{special.GetProc(bindings)}; + p(elementDesc, unit64, ioType, vListDesc, ioStat64, ioMsg, ioTypeLen, + sizeof ioMsg); + ioStat = ioStat64; + } else { // 32-bit UNIT=/IOSTAT= + auto *p{special.GetProc(bindings)}; + p(elementDesc, unit, ioType, vListDesc, ioStat, ioMsg, ioTypeLen, + sizeof ioMsg); + } } else { // "dtv" argument is "type(t)", pass a raw pointer - auto *p{special.GetProc( - bindings)}; - p(descriptor.Element(subscripts), unit, ioType, vListDesc, ioStat, - ioMsg, ioTypeLen, sizeof ioMsg); + if (integer8) { // 64-bit UNIT= and IOSTAT= + std::int64_t unit64{unit}; + std::int64_t ioStat64{ioStat}; + auto *p{special.GetProc(bindings)}; + p(descriptor.Element(subscripts), unit64, ioType, vListDesc, + ioStat64, ioMsg, ioTypeLen, sizeof ioMsg); + ioStat = ioStat64; + } else { // 32-bit UNIT= and IOSTAT= + auto *p{special.GetProc(bindings)}; + p(descriptor.Element(subscripts), unit, ioType, vListDesc, ioStat, + ioMsg, ioTypeLen, sizeof ioMsg); + } } handler.Forward(ioStat, ioMsg, sizeof ioMsg); external->PopChildIo(child); @@ -458,11 +496,16 @@ RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { ? common::DefinedIo::ReadUnformatted : common::DefinedIo::WriteUnformatted)}) { if (definedIo->subroutine) { + std::uint8_t isArgDescriptorSet{0}; + if (definedIo->flags & IsDtvArgPolymorphic) { + isArgDescriptorSet = 1; + } typeInfo::SpecialBinding special{DIR == Direction::Input ? typeInfo::SpecialBinding::Which::ReadUnformatted : typeInfo::SpecialBinding::Which::WriteUnformatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false}; + definedIo->subroutine, isArgDescriptorSet, + /*IsTypeBound=*/false, + /*specialCaseFlag=*/!!(definedIo->flags & DefinedIoInteger8)}; if (DefinedUnformattedIo(io_, instance_, *type, special)) { anyIoTookPlace_ = true; return StatOk; @@ -719,8 +762,11 @@ RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { nonTbpSpecial_.emplace(DIR == Direction::Input ? typeInfo::SpecialBinding::Which::ReadFormatted : typeInfo::SpecialBinding::Which::WriteFormatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false); + definedIo->subroutine, + /*isArgDescriptorSet=*/ + (definedIo->flags & IsDtvArgPolymorphic) ? 
1 : 0, + /*isTypeBound=*/false, + /*specialCaseFlag=*/!!(definedIo->flags & DefinedIoInteger8)); special_ = &*nonTbpSpecial_; } } diff --git a/flang-rt/lib/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp index e70dff3997233..f6c39468d5655 100644 --- a/flang-rt/lib/runtime/extensions.cpp +++ b/flang-rt/lib/runtime/extensions.cpp @@ -27,10 +27,7 @@ #include #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#define NOMINMAX -#include <windows.h> - +#include "flang/Common/windows-include.h" #include inline void CtimeBuffer(char *buffer, size_t bufsize, const time_t cur_time, @@ -309,6 +306,9 @@ void RTNAME(Perror)(const char *str) { perror(str); } // GNU extension function TIME() std::int64_t RTNAME(time)() { return time(nullptr); } +// MCLOCK: returns accumulated CPU time in ticks +std::int32_t FORTRAN_PROCEDURE_NAME(mclock)() { return std::clock(); } + // Extension procedures related to I/O namespace io { diff --git a/flang-rt/lib/runtime/non-tbp-dio.cpp b/flang-rt/lib/runtime/non-tbp-dio.cpp index 72101b06e0c6e..d516526033c27 100644 --- a/flang-rt/lib/runtime/non-tbp-dio.cpp +++ b/flang-rt/lib/runtime/non-tbp-dio.cpp @@ -17,7 +17,7 @@ const NonTbpDefinedIo *NonTbpDefinedIoTable::Find( for (const auto *p{item}; j-- > 0; ++p) { if (&p->derivedType == &type && p->definedIo == definedIo) { return p; - } else if (p->isDtvArgPolymorphic) { + } else if (p->flags & IsDtvArgPolymorphic) { for (const typeInfo::DerivedType *t{type.GetParentType()}; t; t = t->GetParentType()) { if (&p->derivedType == t && p->definedIo == definedIo) { diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp index 3e1d7c9c3c788..50123f4cf321c 100644 --- a/flang-rt/lib/runtime/type-info.cpp +++ b/flang-rt/lib/runtime/type-info.cpp @@ -330,7 +330,7 @@ FILE *SpecialBinding::Dump(FILE *f) const { } std::fprintf(f, " isArgDescriptorSet: 0x%x\n", isArgDescriptorSet_); std::fprintf(f, " isTypeBound: %d\n", isTypeBound_); - std::fprintf(f, " isArgContiguousSet: 0x%x\n", isArgContiguousSet_); + std::fprintf(f, " specialCaseFlag: 0x%x\n", specialCaseFlag_); std::fprintf(f, " proc: %p\n", reinterpret_cast<void *>(proc_)); return f; } diff --git a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp index f1f931e87a86e..83aa37f8d06f3 100644 --- a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp +++ b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp @@ -72,3 +72,13 @@ TEST(AllocatableCUFTest, DescriptorAllocationTest) { EXPECT_TRUE(desc != nullptr); RTNAME(CUFFreeDescriptor)(desc); } + +TEST(AllocatableCUFTest, CUFSetAllocatorIndex) { + using Fortran::common::TypeCategory; + RTNAME(CUFRegisterAllocator)(); + // REAL(4), DEVICE, ALLOCATABLE :: a(:) + auto a{createAllocatable(TypeCategory::Real, 4)}; + EXPECT_EQ((int)kDefaultAllocator, a->GetAllocIdx()); + RTNAME(CUFSetAllocatorIndex)(*a, kDeviceAllocatorPos, __FILE__, __LINE__); + EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); +} diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 72d12cd92600d..c167a55bc486d 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -1,9 +1,9 @@ - # Fortran Extensions supported by Flang @@ -170,6 +170,18 @@ end In the case of `DEFERRED` bindings in an `ABSTRACT` derived type, however, overrides are necessary, so they are permitted for inaccessible bindings with an optional warning.
+* The main program name is allowed to be the same as other symbols used + in the main program, for example: +``` +module m +end +program m +use m +end +``` + Note that internally the main program symbol name is all uppercase, unlike + the names of all other symbols, which are usually all lowercase. This + may make a difference in testing/debugging. ## Extensions, deletions, and legacy features supported by default diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 0118f8eb7d913..f7da6c889d413 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -709,8 +709,9 @@ CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, LOC MALLOC, FREE ``` -### Library subroutine +### Library subroutines and functions ``` +ticks = MCLOCK() CALL BACKTRACE() CALL FDATE(TIME) CALL GETLOG(USRNAME) diff --git a/flang/docs/ParallelMultiImageFortranRuntime.md b/flang/docs/ParallelMultiImageFortranRuntime.md new file mode 100644 index 0000000000000..8cf0055e5817b --- /dev/null +++ b/flang/docs/ParallelMultiImageFortranRuntime.md @@ -0,0 +1,18 @@ + + +# Multi-Image Parallel Fortran Runtime + + +The Parallel Runtime Interface for Fortran (PRIF) defines an +interface designed for LLVM Flang to target implementations of +Fortran's multi-image parallel features. + +The current revision of the PRIF specification is here: + + diff --git a/flang/docs/index.md b/flang/docs/index.md index 2568ad70c5d09..016577bcb1e98 100644 --- a/flang/docs/index.md +++ b/flang/docs/index.md @@ -78,6 +78,7 @@ on how to get in touch with us and to learn more about the current status. OpenMP-semantics OptionComparison Overview + ParallelMultiImageFortranRuntime ParameterizedDerivedTypes ParserCombinators Parsing diff --git a/flang/include/flang/Common/format.h b/flang/include/flang/Common/format.h index 848aa9809e4b7..1e64acb823616 100644 --- a/flang/include/flang/Common/format.h +++ b/flang/include/flang/Common/format.h @@ -32,27 +32,33 @@ namespace Fortran::common { // AddOverflow and MulOverflow are copied from // llvm/include/llvm/Support/MathExtras.h and specialised to int64_t. +// __has_builtin is not defined in some compilers. Make sure it is defined. +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + /// Add two signed integers, computing the two's complement truncated result, /// returning true if overflow occurred. -static inline bool AddOverflow(int64_t X, int64_t Y, int64_t &Result) { +static inline bool AddOverflow( + std::int64_t x, std::int64_t y, std::int64_t &result) { #if __has_builtin(__builtin_add_overflow) - return __builtin_add_overflow(X, Y, &Result); + return __builtin_add_overflow(x, y, &result); #else // Perform the unsigned addition. - const uint64_t UX = static_cast<uint64_t>(X); - const uint64_t UY = static_cast<uint64_t>(Y); - const uint64_t UResult = UX + UY; + const std::uint64_t ux{static_cast<std::uint64_t>(x)}; + const std::uint64_t uy{static_cast<std::uint64_t>(y)}; + const std::uint64_t uresult{ux + uy}; // Convert to signed. - Result = static_cast<int64_t>(UResult); + result = static_cast<std::int64_t>(uresult); // Adding two positive numbers should result in a positive number. - if (X > 0 && Y > 0) { - return Result <= 0; + if (x > 0 && y > 0) { + return result <= 0; } // Adding two negatives should result in a negative number.
- if (X < 0 && Y < 0) { - return Result >= 0; + if (x < 0 && y < 0) { + return result >= 0; } return false; #endif @@ -60,36 +66,39 @@ static inline bool AddOverflow(int64_t X, int64_t Y, int64_t &Result) { /// Multiply two signed integers, computing the two's complement truncated /// result, returning true if an overflow occurred. -static inline bool MulOverflow(int64_t X, int64_t Y, int64_t &Result) { +static inline bool MulOverflow( + std::int64_t x, std::int64_t y, std::int64_t &result) { #if __has_builtin(__builtin_mul_overflow) - return __builtin_mul_overflow(X, Y, &Result); + return __builtin_mul_overflow(x, y, &result); #else // Perform the unsigned multiplication on absolute values. - const uint64_t UX = - X < 0 ? (0 - static_cast<uint64_t>(X)) : static_cast<uint64_t>(X); - const uint64_t UY = - Y < 0 ? (0 - static_cast<uint64_t>(Y)) : static_cast<uint64_t>(Y); - const uint64_t UResult = UX * UY; + const std::uint64_t ux{x < 0 ? (0 - static_cast<std::uint64_t>(x)) + : static_cast<std::uint64_t>(x)}; + const std::uint64_t uy{y < 0 ? (0 - static_cast<std::uint64_t>(y)) + : static_cast<std::uint64_t>(y)}; + const std::uint64_t uresult{ux * uy}; // Convert to signed. - const bool IsNegative = (X < 0) ^ (Y < 0); - Result = IsNegative ? (0 - UResult) : UResult; + const bool isNegative = (x < 0) ^ (y < 0); + result = isNegative ? (0 - uresult) : uresult; // If any of the args was 0, result is 0 and no overflow occurs. - if (UX == 0 || UY == 0) { + if (ux == 0 || uy == 0) { return false; } - // UX and UY are in [1, 2^n], where n is the number of digits. + // ux and uy are in [1, 2^n], where n is the number of digits. // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for // positive) divided by an argument compares to the other. - if (IsNegative) { - return UX > (static_cast<uint64_t>(std::numeric_limits<int64_t>::max()) + - uint64_t(1)) / - UY; + if (isNegative) { + return ux > + (static_cast<std::uint64_t>(std::numeric_limits<std::int64_t>::max()) + + std::uint64_t{1}) / + uy; } else { - return UX > - (static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) / UY; + return ux > + (static_cast<std::uint64_t>(std::numeric_limits<std::int64_t>::max())) / + uy; } #endif } @@ -206,10 +215,10 @@ template <typename CHAR> class FormatValidator { Token token_{}; // current token Token knrToken_{}; // k, n, or r UnsignedInteger token Token scaleFactorToken_{}; // most recent scale factor token P - int64_t integerValue_{-1}; // value of UnsignedInteger token - int64_t knrValue_{-1}; // -1 ==> not present - int64_t scaleFactorValue_{}; // signed k in kP - int64_t wValue_{-1}; + std::int64_t integerValue_{-1}; // value of UnsignedInteger token + std::int64_t knrValue_{-1}; // -1 ==> not present + std::int64_t scaleFactorValue_{}; // signed k in kP + std::int64_t wValue_{-1}; char argString_[3]{}; // 1-2 character msg arg; usually edit descriptor name bool formatHasErrors_{false}; bool unterminatedFormatError_{false}; @@ -280,18 +289,18 @@ template <typename CHAR> void FormatValidator<CHAR>::NextToken() { case '7': case '8': case '9': { - const CHAR *lastCursor; + const CHAR *lastCursor{}; integerValue_ = 0; bool overflow{false}; do { lastCursor = cursor_; if (!overflow) { - overflow = - MulOverflow(static_cast<int64_t>(10), integerValue_, integerValue_); + overflow = MulOverflow( + static_cast<std::int64_t>(10), integerValue_, integerValue_); } if (!overflow) { overflow = AddOverflow( - integerValue_, static_cast<int64_t>(c - '0'), integerValue_); + integerValue_, static_cast<std::int64_t>(c - '0'), integerValue_); } c = NextChar(); } while (c >= '0' && c <= '9'); diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index d38c5b6d09a82..acdba7c49e6b3 100644 ---
a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -241,11 +241,11 @@ struct IntrinsicLibrary { void genCFProcPointer(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genClock64(mlir::Type, llvm::ArrayRef<mlir::Value>); template <mlir::arith::CmpIPredicate pred> fir::ExtendedValue genCPtrCompare(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genCosd(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genCospi(mlir::Type, llvm::ArrayRef<mlir::Value>); void genDateAndTime(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genDim(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genDotProduct(mlir::Type, @@ -376,6 +376,8 @@ struct IntrinsicLibrary { fir::ExtendedValue genNorm2(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genNot(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); + template <typename OpTy> + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genPerror(llvm::ArrayRef<fir::ExtendedValue>); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h index bdeb7574012c6..43dca65322a62 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h @@ -31,6 +31,10 @@ void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc, void genDescriptorCheckSection(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value desc); +/// Generate runtime call to set the allocator index in the descriptor. +void genSetAllocatorIndex(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value desc, mlir::Value index); + } // namespace fir::runtime::cuda #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_ diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index e38738230ffbc..23ab882615209 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -388,4 +388,25 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> { let hasVerifier = 1; } +def cuf_SetAllocatorIndexOp : cuf_Op<"set_allocator_idx", []> { + let summary = "Set the allocator index in a descriptor"; + + let description = [{ + The allocator index in the Fortran descriptor is used to retrieve the + correct CUDA allocator to allocate the memory on the device. + In many cases the allocator index is set when the descriptor is created. For + device components, the descriptor is part of the derived-type itself, so the + index needs to be set after the derived-type is allocated in managed memory.
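+
+    Example (an illustrative sketch; the box type shown is hypothetical,
+    rendered per the assembly format below):
+
+    ```mlir
+    cuf.set_allocator_idx %comp : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
+    ```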
+ }]; + + let arguments = (ins Arg:$box, + cuf_DataAttributeAttr:$data_attr); + + let assemblyFormat = [{ + $box `:` qualified(type($box)) attr-dict + }]; + + let hasVerifier = 1; +} + #endif // FORTRAN_DIALECT_CUF_CUF_OPS diff --git a/flang/include/flang/Parser/message.h b/flang/include/flang/Parser/message.h index db1a0a65157e3..9192d23529913 100644 --- a/flang/include/flang/Parser/message.h +++ b/flang/include/flang/Parser/message.h @@ -355,9 +355,9 @@ class Messages { void Emit(llvm::raw_ostream &, const AllCookedSources &, bool echoSourceLines = true, const common::LanguageFeatureControl *hintFlags = nullptr, - std::size_t maxErrorsToEmit = 0) const; + std::size_t maxErrorsToEmit = 0, bool warningsAreErrors = false) const; void AttachTo(Message &, std::optional = std::nullopt); - bool AnyFatalError() const; + bool AnyFatalError(bool warningsAreErrors = false) const; private: std::list messages_; diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index ab2dde7d5dfbe..cc1d032f94d4a 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3481,6 +3481,9 @@ struct OmpDirectiveName { // This allows "construct(Verbatim(""))". OmpDirectiveName(const Verbatim &name); using WrapperTrait = std::true_type; + + bool IsExecutionPart() const; // Is allowed in the execution part + CharBlock source; llvm::omp::Directive v{llvm::omp::Directive::OMPD_unknown}; }; @@ -5028,6 +5031,13 @@ struct OpenMPLoopConstruct { t; }; +// Lookahead class to identify execution-part OpenMP constructs without +// parsing the entire OpenMP construct. +struct OpenMPExecDirective { + WRAPPER_CLASS_BOILERPLATE(OpenMPExecDirective, OmpDirectiveName); + CharBlock source; +}; + struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant #include +#define FORTRAN_PROCEDURE_NAME(name) name##_ + #ifdef _WIN32 // UID and GID don't exist on Windows, these exist to avoid errors. 
typedef std::uint32_t uid_t; @@ -89,5 +87,8 @@ int FORTRAN_PROCEDURE_NAME(ierrno)(); // GNU extension subroutine PERROR(STRING) void RTNAME(Perror)(const char *str); +// MCLOCK -- returns accumulated time in ticks +int FORTRAN_PROCEDURE_NAME(mclock)(); + } // extern "C" #endif // FORTRAN_RUNTIME_EXTENSIONS_H_ diff --git a/flang/include/flang/Semantics/runtime-type-info.h b/flang/include/flang/Semantics/runtime-type-info.h index 6c5a061d1c1a2..94e8293b14643 100644 --- a/flang/include/flang/Semantics/runtime-type-info.h +++ b/flang/include/flang/Semantics/runtime-type-info.h @@ -52,10 +52,15 @@ constexpr char procCompName[]{"proc"}; SymbolVector CollectBindings(const Scope &dtScope); +enum NonTbpDefinedIoFlags { + IsDtvArgPolymorphic = 1 << 0, + DefinedIoInteger8 = 1 << 1, +}; + struct NonTbpDefinedIo { const Symbol *subroutine; common::DefinedIo definedIo; - bool isDtvArgPolymorphic; + std::uint8_t flags; }; std::multimap diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index fb670528f3ce4..317b9357b4c1f 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -199,6 +199,8 @@ bool IsPolymorphic(const Symbol &); bool IsUnlimitedPolymorphic(const Symbol &); bool IsPolymorphicAllocatable(const Symbol &); +bool IsDeviceAllocatable(const Symbol &symbol); + inline bool IsCUDADeviceContext(const Scope *scope) { if (scope) { if (const Symbol * symbol{scope->symbol()}) { diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 4773e136c41cb..9957010684d48 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -428,6 +428,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"conjg", {{"z", SameComplex}}, SameComplex}, {"cos", {{"x", SameFloating}}, SameFloating}, {"cosd", {{"x", SameFloating}}, SameFloating}, + {"cospi", {{"x", SameFloating}}, SameFloating}, {"cosh", {{"x", SameFloating}}, SameFloating}, {"coshape", {{"coarray", AnyData, Rank::coarray}, SizeDefaultKIND}, KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp index 2429e07e5b8c4..58901c6000380 100644 --- a/flang/lib/Frontend/FrontendAction.cpp +++ b/flang/lib/Frontend/FrontendAction.cpp @@ -230,15 +230,14 @@ bool FrontendAction::reportFatalErrors(const char (&message)[N]) { const common::LanguageFeatureControl &features{ instance->getInvocation().getFortranOpts().features}; const size_t maxErrors{instance->getInvocation().getMaxErrors()}; - if (!instance->getParsing().messages().empty() && - (instance->getInvocation().getWarnAsErr() || - instance->getParsing().messages().AnyFatalError())) { + const bool warningsAreErrors{instance->getInvocation().getWarnAsErr()}; + if (instance->getParsing().messages().AnyFatalError(warningsAreErrors)) { const unsigned diagID = instance->getDiagnostics().getCustomDiagID( clang::DiagnosticsEngine::Error, message); instance->getDiagnostics().Report(diagID) << getCurrentFileOrBufferName(); instance->getParsing().messages().Emit( llvm::errs(), instance->getAllCookedSources(), - /*echoSourceLines=*/true, &features, maxErrors); + /*echoSourceLines=*/true, &features, maxErrors, warningsAreErrors); return true; } if (instance->getParsing().parseTree().has_value() && @@ -249,7 +248,7 @@ bool FrontendAction::reportFatalErrors(const char (&message)[N]) { instance->getDiagnostics().Report(diagID) << getCurrentFileOrBufferName(); 
instance->getParsing().messages().Emit( llvm::errs(), instance->getAllCookedSources(), - /*echoSourceLine=*/true, &features, maxErrors); + /*echoSourceLine=*/true, &features, maxErrors, warningsAreErrors); instance->getParsing().EmitMessage( llvm::errs(), instance->getParsing().finalRestingPlace(), "parser FAIL (final position)", "error: ", llvm::raw_ostream::RED); diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 33c1f1e7a3c3a..4241d12601242 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -810,11 +810,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::ExtendedValue read = fir::factory::genMutableBoxRead( *builder, loc, box, /*mayBePolymorphic=*/false); if (auto read_arr_box = read.getBoxOf()) { - fir::factory::genInlinedAllocation( - *builder, loc, *new_box, read_arr_box->getLBounds(), - read_arr_box->getExtents(), - /*lenParams=*/std::nullopt, name, - /*mustBeHeap=*/true); + fir::factory::genInlinedAllocation(*builder, loc, *new_box, + read_arr_box->getLBounds(), + read_arr_box->getExtents(), + /*lenParams=*/{}, name, + /*mustBeHeap=*/true); } else if (auto read_char_arr_box = read.getBoxOf()) { fir::factory::genInlinedAllocation( @@ -825,8 +825,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { } else if (auto read_char_box = read.getBoxOf()) { fir::factory::genInlinedAllocation(*builder, loc, *new_box, - /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, + /*lbounds=*/{}, + /*extents=*/{}, read_char_box->getLen(), name, /*mustBeHeap=*/true); } else { @@ -4590,8 +4590,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { // the static type of the LHS. if (Fortran::evaluate::UnwrapExpr( assign.rhs)) - return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, - std::nullopt); + return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, {}); hlfir::Entity rhs = Fortran::lower::convertExprToHLFIR( loc, *this, assign.rhs, localSymbols, rhsContext); // Create pointer descriptor value from the RHS. 
@@ -5199,7 +5198,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { "LEN parameters"); lhsRealloc = fir::factory::genReallocIfNeeded( *builder, loc, *lhsMutableBox, - /*shape=*/std::nullopt, lengthParams); + /*shape=*/{}, lengthParams); return lhsRealloc->newValue; } return genExprAddr(assign.lhs, stmtCtx); @@ -5271,7 +5270,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { if (lhsIsWholeAllocatable) { assert(lhsRealloc.has_value()); fir::factory::finalizeRealloc(*builder, loc, *lhsMutableBox, - /*lbounds=*/std::nullopt, + /*lbounds=*/{}, /*takeLboundsIfRealloc=*/false, *lhsRealloc); } @@ -6059,8 +6058,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::func::FuncOp func = fir::FirOpBuilder::createFunction( mlir::UnknownLoc::get(context), getModuleOp(), fir::NameUniquer::doGenerated("Sham"), - mlir::FunctionType::get(context, std::nullopt, std::nullopt), - symbolTable); + mlir::FunctionType::get(context, {}, {}), symbolTable); func.addEntryBlock(); CHECK(!builder && "Expected builder to be uninitialized"); builder = new fir::FirOpBuilder(func, bridge.getKindMap(), symbolTable); diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 6ed15df0de754..071513303da25 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -496,8 +496,7 @@ Fortran::lower::genCallOpAndResult( auto *context = builder.getContext(); if (mlir::isa(snd) && mlir::isa(fst.getType())) { - auto funcTy = - mlir::FunctionType::get(context, std::nullopt, std::nullopt); + auto funcTy = mlir::FunctionType::get(context, {}, {}); auto boxProcTy = builder.getBoxProcType(funcTy); if (mlir::Value host = argumentHostAssocs(converter, fst)) { cast = builder.create( @@ -1714,7 +1713,7 @@ void prepareUserCallArguments( /*nonDeferredParams=*/mlir::ValueRange{}, /*mutableProperties=*/{}); fir::factory::associateMutableBox(builder, loc, ptrBox, actualExv, - /*lbounds=*/std::nullopt); + /*lbounds=*/{}); caller.placeInput(arg, irBox); continue; } diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp index 1051d50ce8a9a..1850b67898126 100644 --- a/flang/lib/Lower/ConvertConstant.cpp +++ b/flang/lib/Lower/ConvertConstant.cpp @@ -374,8 +374,8 @@ static mlir::Value genStructureComponentInit( "allocatable component value that is not NULL"); } else { // Handle NULL() initialization - mlir::Value componentValue{fir::factory::createUnallocatedBox( - builder, loc, componentTy, std::nullopt)}; + mlir::Value componentValue{ + fir::factory::createUnallocatedBox(builder, loc, componentTy, {})}; componentValue = builder.createConvert(loc, componentTy, componentValue); return builder.create( diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index f3430bfa3021e..0a1cd67789772 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -596,7 +596,7 @@ absentBoxToUnallocatedBox(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type boxType = box.getType(); assert(mlir::isa(boxType) && "argument must be a fir.box"); mlir::Value emptyBox = - fir::factory::createUnallocatedBox(builder, loc, boxType, std::nullopt); + fir::factory::createUnallocatedBox(builder, loc, boxType, {}); auto safeToReadBox = builder.create(loc, isPresent, box, emptyBox); return fir::substBase(exv, safeToReadBox); @@ -2663,8 +2663,7 @@ class ScalarExprLowering { /*nonDeferredParams=*/mlir::ValueRange{}, /*mutableProperties=*/{}); Fortran::lower::associateMutableBox(converter, loc, 
pointer, *expr, - /*lbounds=*/std::nullopt, - stmtCtx); + /*lbounds=*/{}, stmtCtx); caller.placeInput(arg, irBox); continue; } @@ -6186,7 +6185,7 @@ class ArrayExprLowering { mlir::FunctionType memcpyType() { auto ptrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); llvm::SmallVector args = {ptrTy, ptrTy, builder.getI64Type()}; - return mlir::FunctionType::get(builder.getContext(), args, std::nullopt); + return mlir::FunctionType::get(builder.getContext(), args, {}); } /// Create a call to the LLVM memcpy intrinsic. diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp index 9689f920840fb..7de433d6a201a 100644 --- a/flang/lib/Lower/ConvertExprToHLFIR.cpp +++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp @@ -1945,7 +1945,7 @@ class HlfirBuilder { fir::emitFatalError(loc, "pointer component designator could not be " "lowered to mutable box"); Fortran::lower::associateMutableBox(converter, loc, *toBox, expr, - /*lbounds=*/std::nullopt, stmtCtx); + /*lbounds=*/{}, stmtCtx); continue; } diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 44f534e7d569a..6cda742874ccf 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -236,9 +236,8 @@ mlir::Value Fortran::lower::genInitialDataTarget( fir::FirOpBuilder &builder = converter.getFirOpBuilder(); if (Fortran::evaluate::UnwrapExpr( initialTarget)) - return fir::factory::createUnallocatedBox( - builder, loc, boxType, - /*nonDeferredParams=*/std::nullopt); + return fir::factory::createUnallocatedBox(builder, loc, boxType, + /*nonDeferredParams=*/{}); // Pointer initial data target, and NULL(mold). for (const auto &sym : Fortran::evaluate::CollectSymbols(initialTarget)) { // Derived type component symbols should not be instantiated as objects @@ -354,8 +353,8 @@ static mlir::Value genComponentDefaultInit( // From a standard point of view, pointer without initialization do not // need to be disassociated, but for sanity and simplicity, do it in // global constructor since this has no runtime cost. - componentValue = fir::factory::createUnallocatedBox( - builder, loc, componentTy, std::nullopt); + componentValue = + fir::factory::createUnallocatedBox(builder, loc, componentTy, {}); } else if (Fortran::lower::hasDefaultInitialization(component)) { // Component type has default initialization. componentValue = genDefaultInitializerValue(converter, loc, component, @@ -554,7 +553,7 @@ fir::GlobalOp Fortran::lower::defineGlobal( createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) { mlir::Value box = fir::factory::createUnallocatedBox( b, loc, symTy, - /*nonDeferredParams=*/std::nullopt, + /*nonDeferredParams=*/{}, /*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr)); b.create(loc, box); }); @@ -787,6 +786,98 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, return res; } +/// Device allocatable components in a derived-type don't have the correct +/// allocator index in their descriptor when they are created. After +/// initialization, cuf.set_allocator_idx operations are inserted to set the +/// correct allocator index for each device component. 
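+/// For example (a sketch; 't' and 'x' are hypothetical user code):
+///   type t
+///     real, device, allocatable :: a(:)
+///   end type t
+///   type(t) :: x
+/// After 'x' is instantiated, a cuf.set_allocator_idx is emitted for the
+/// descriptor embedded in 'x%a' so that a later ALLOCATE picks the device
+/// allocator.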
+static void +initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, + const Fortran::semantics::Symbol &symbol, + Fortran::lower::SymMap &symMap) { + if (const auto *details{ + symbol.GetUltimate() + .detailsIf()}) { + const Fortran::semantics::DeclTypeSpec *type{details->type()}; + const Fortran::semantics::DerivedTypeSpec *derived{type ? type->AsDerived() + : nullptr}; + if (derived) { + if (!FindCUDADeviceAllocatableUltimateComponent(*derived)) + return; // No device components. + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mlir::Location loc = converter.getCurrentLocation(); + + fir::ExtendedValue exv = + converter.getSymbolExtendedValue(symbol.GetUltimate(), &symMap); + mlir::Type baseTy = fir::unwrapRefType(fir::getBase(exv).getType()); + if (auto boxTy = mlir::dyn_cast(baseTy)) + baseTy = boxTy.getEleTy(); + baseTy = fir::unwrapRefType(baseTy); + auto recTy = + mlir::dyn_cast(fir::unwrapSequenceType(baseTy)); + assert(recTy && "expected fir::RecordType"); + + llvm::SmallVector coordinates; + Fortran::semantics::UltimateComponentIterator components{*derived}; + for (const auto &sym : components) { + if (Fortran::semantics::IsDeviceAllocatable(sym)) { + unsigned fieldIdx = recTy.getFieldIndex(sym.name().ToString()); + mlir::Type fieldTy; + std::vector coordinates; + + if (fieldIdx != std::numeric_limits::max()) { + // Field found in the base record type. + auto fieldName = recTy.getTypeList()[fieldIdx].first; + fieldTy = recTy.getTypeList()[fieldIdx].second; + mlir::Value fieldIndex = builder.create( + loc, fir::FieldType::get(fieldTy.getContext()), fieldName, + recTy, + /*typeParams=*/mlir::ValueRange{}); + coordinates.push_back(fieldIndex); + } else { + // Field not found in base record type, search in potential + // record type components. + for (auto component : recTy.getTypeList()) { + if (auto childRecTy = + mlir::dyn_cast(component.second)) { + fieldIdx = childRecTy.getFieldIndex(sym.name().ToString()); + if (fieldIdx != std::numeric_limits::max()) { + mlir::Value parentFieldIndex = + builder.create( + loc, fir::FieldType::get(childRecTy.getContext()), + component.first, recTy, + /*typeParams=*/mlir::ValueRange{}); + coordinates.push_back(parentFieldIndex); + auto fieldName = childRecTy.getTypeList()[fieldIdx].first; + fieldTy = childRecTy.getTypeList()[fieldIdx].second; + mlir::Value childFieldIndex = + builder.create( + loc, fir::FieldType::get(fieldTy.getContext()), + fieldName, childRecTy, + /*typeParams=*/mlir::ValueRange{}); + coordinates.push_back(childFieldIndex); + break; + } + } + } + } + + if (coordinates.empty()) + TODO(loc, "device resident component in complex derived-type " + "hierarchy"); + + mlir::Value comp = builder.create( + loc, builder.getRefType(fieldTy), fir::getBase(exv), coordinates); + cuf::DataAttributeAttr dataAttr = + Fortran::lower::translateSymbolCUFDataAttribute( + builder.getContext(), sym); + builder.create(loc, comp, dataAttr); + } + } + } + } +} + /// Must \p var be default initialized at runtime when entering its scope. 
static bool mustBeDefaultInitializedAtRuntime(const Fortran::lower::pft::Variable &var) { @@ -1109,6 +1200,9 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter, if (mustBeDefaultInitializedAtRuntime(var)) Fortran::lower::defaultInitializeAtRuntime(converter, var.getSymbol(), symMap); + if (converter.getFoldingContext().languageFeatures().IsEnabled( + Fortran::common::LanguageFeature::CUDA)) + initializeDeviceComponentAllocator(converter, var.getSymbol(), symMap); auto *builder = &converter.getFirOpBuilder(); if (needCUDAAlloc(var.getSymbol()) && !cuf::isCUDADeviceContext(builder->getRegion())) { @@ -1367,6 +1461,9 @@ static void instantiateAlias(Fortran::lower::AbstractConverter &converter, if (mustBeDefaultInitializedAtRuntime(var)) Fortran::lower::defaultInitializeAtRuntime(converter, var.getSymbol(), symMap); + if (converter.getFoldingContext().languageFeatures().IsEnabled( + Fortran::common::LanguageFeature::CUDA)) + initializeDeviceComponentAllocator(converter, var.getSymbol(), symMap); } //===--------------------------------------------------------------===// diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp index 6a44be65a6cde..95ea74b791b47 100644 --- a/flang/lib/Lower/HostAssociations.cpp +++ b/flang/lib/Lower/HostAssociations.cpp @@ -410,15 +410,15 @@ class CapturedArrays : public CapturedSymbols { .genThen([&]() { fir::factory::associateMutableBox(builder, loc, boxInTuple, args.hostValue, - /*lbounds=*/std::nullopt); + /*lbounds=*/{}); }) .genElse([&]() { fir::factory::disassociateMutableBox(builder, loc, boxInTuple); }) .end(); } else { - fir::factory::associateMutableBox( - builder, loc, boxInTuple, args.hostValue, /*lbounds=*/std::nullopt); + fir::factory::associateMutableBox(builder, loc, boxInTuple, + args.hostValue, /*lbounds=*/{}); } } diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp index 63a612d7ead61..53bf61922392d 100644 --- a/flang/lib/Lower/IO.cpp +++ b/flang/lib/Lower/IO.cpp @@ -269,10 +269,12 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, mlir::Type sizeTy = fir::runtime::getModel<std::size_t>()(builder.getContext()); mlir::Type intTy = fir::runtime::getModel<int>()(builder.getContext()); + mlir::Type byteTy = + fir::runtime::getModel<std::uint8_t>()(builder.getContext()); mlir::Type boolTy = fir::runtime::getModel<bool>()(builder.getContext()); mlir::Type listTy = fir::SequenceType::get( definedIoProcMap.size(), - mlir::TupleType::get(context, {refTy, refTy, intTy, boolTy})); + mlir::TupleType::get(context, {refTy, refTy, intTy, byteTy})); mlir::Type tableTy = mlir::TupleType::get( context, {sizeTy, fir::ReferenceType::get(listTy), boolTy}); @@ -339,9 +341,9 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, insert(builder.createIntegerConstant( loc, intTy, static_cast<int>(iface.second.definedIo))); // polymorphic flag is set if first defined IO dummy arg is CLASS(T) + // defaultInt8 flag is set if -fdefault-integer-8 is in effect // [bool isDtvArgPolymorphic] - insert(builder.createIntegerConstant(loc, boolTy, - iface.second.isDtvArgPolymorphic)); + insert(builder.createIntegerConstant(loc, byteTy, iface.second.flags)); } if (tableIsLocal) builder.create<fir::StoreOp>(loc, list, listAddr); @@ -526,7 +528,7 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, descAddr = builder.createTemporary(loc, boxType); fir::MutableBoxValue box = fir::MutableBoxValue(descAddr, {}, {}); fir::factory::associateMutableBox(builder, loc, box, exv, /*lbounds=*/{}); }
descAddr = builder.createConvert(loc, descRefTy, descAddr); list = builder.create(loc, listTy, list, descAddr, diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 39e4444cde4e3..51eb33dec186b 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -708,6 +708,7 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, bool setDeclareAttr = false) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext}; + const bool unwrapBoxAddr = true; for (const auto &accObject : objectList.v) { llvm::SmallVector bounds; std::stringstream asFortran; @@ -735,8 +736,25 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, Op op = createDataEntryOp( builder, operandLocation, baseAddr, asFortran, bounds, structured, implicit, dataClause, baseAddr.getType(), async, asyncDeviceTypes, - asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true, info.isPresent); + asyncOnlyDeviceTypes, unwrapBoxAddr, info.isPresent); dataOperands.push_back(op.getAccVar()); + + // For UseDeviceOp, if operand is one of a pair resulting from a + // declare operation, create a UseDeviceOp for the other operand as well. + if constexpr (std::is_same_v) { + if (auto declareOp = + mlir::dyn_cast(baseAddr.getDefiningOp())) { + mlir::Value otherAddr = declareOp.getResult(1); + if (baseAddr != otherAddr) { + Op op = createDataEntryOp(builder, operandLocation, otherAddr, + asFortran, bounds, structured, implicit, + dataClause, otherAddr.getType(), async, + asyncDeviceTypes, asyncOnlyDeviceTypes, + unwrapBoxAddr, info.isPresent); + dataOperands.push_back(op.getAccVar()); + } + } + } } } @@ -4396,10 +4414,34 @@ getAttributeValueByDeviceType(llvm::SmallVector &attributes, return std::nullopt; } +// Helper function to extract string value from bind name variant +static std::optional getBindNameStringValue( + const std::optional> + &bindNameValue) { + if (!bindNameValue.has_value()) + return std::nullopt; + + return std::visit( + [](const auto &attr) -> std::optional { + if constexpr (std::is_same_v, + mlir::StringAttr>) { + return attr.getValue(); + } else if constexpr (std::is_same_v, + mlir::SymbolRefAttr>) { + return attr.getLeafReference(); + } else { + return std::nullopt; + } + }, + bindNameValue.value()); +} + static bool compareDeviceTypeInfo( mlir::acc::RoutineOp op, - llvm::SmallVector &bindNameArrayAttr, - llvm::SmallVector &bindNameDeviceTypeArrayAttr, + llvm::SmallVector &bindIdNameArrayAttr, + llvm::SmallVector &bindStrNameArrayAttr, + llvm::SmallVector &bindIdNameDeviceTypeArrayAttr, + llvm::SmallVector &bindStrNameDeviceTypeArrayAttr, llvm::SmallVector &gangArrayAttr, llvm::SmallVector &gangDimArrayAttr, llvm::SmallVector &gangDimDeviceTypeArrayAttr, @@ -4409,9 +4451,13 @@ static bool compareDeviceTypeInfo( for (uint32_t dtypeInt = 0; dtypeInt != mlir::acc::getMaxEnumValForDeviceType(); ++dtypeInt) { auto dtype = static_cast(dtypeInt); - if (op.getBindNameValue(dtype) != - getAttributeValueByDeviceType( - bindNameArrayAttr, bindNameDeviceTypeArrayAttr, dtype)) + auto bindNameValue = getBindNameStringValue(op.getBindNameValue(dtype)); + if (bindNameValue != + getAttributeValueByDeviceType( + bindIdNameArrayAttr, bindIdNameDeviceTypeArrayAttr, dtype) && + bindNameValue != + getAttributeValueByDeviceType( + bindStrNameArrayAttr, bindStrNameDeviceTypeArrayAttr, dtype)) return false; if (op.hasGang(dtype) != hasDeviceType(gangArrayAttr, dtype)) return false; @@ -4458,8 +4504,10 @@ 
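// Illustrative sketch, not part of this patch: the dispatch shape used by
// getBindNameStringValue above, reduced to standard types. One generic
// lambda visits the variant and picks behavior per alternative with
// `if constexpr`, so supporting another alternative needs only one branch.
#include <optional>
#include <string>
#include <type_traits>
#include <variant>

static std::optional<std::string>
asString(const std::variant<std::string, int> &v) {
  return std::visit(
      [](const auto &alt) -> std::optional<std::string> {
        if constexpr (std::is_same_v<std::decay_t<decltype(alt)>, std::string>)
          return alt;
        else
          return std::nullopt;
      },
      v);
}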
getArrayAttrOrNull(fir::FirOpBuilder &builder, void createOpenACCRoutineConstruct( Fortran::lower::AbstractConverter &converter, mlir::Location loc, mlir::ModuleOp mod, mlir::func::FuncOp funcOp, std::string funcName, - bool hasNohost, llvm::SmallVector &bindNames, - llvm::SmallVector &bindNameDeviceTypes, + bool hasNohost, llvm::SmallVector &bindIdNames, + llvm::SmallVector &bindStrNames, + llvm::SmallVector &bindIdNameDeviceTypes, + llvm::SmallVector &bindStrNameDeviceTypes, llvm::SmallVector &gangDeviceTypes, llvm::SmallVector &gangDimValues, llvm::SmallVector &gangDimDeviceTypes, @@ -4472,7 +4520,8 @@ void createOpenACCRoutineConstruct( 0) { // If the routine is already specified with the same clauses, just skip // the operation creation. - if (compareDeviceTypeInfo(routineOp, bindNames, bindNameDeviceTypes, + if (compareDeviceTypeInfo(routineOp, bindIdNames, bindStrNames, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes) && @@ -4489,8 +4538,10 @@ void createOpenACCRoutineConstruct( modBuilder.create( loc, routineOpStr, mlir::SymbolRefAttr::get(builder.getContext(), funcName), - getArrayAttrOrNull(builder, bindNames), - getArrayAttrOrNull(builder, bindNameDeviceTypes), + getArrayAttrOrNull(builder, bindIdNames), + getArrayAttrOrNull(builder, bindStrNames), + getArrayAttrOrNull(builder, bindIdNameDeviceTypes), + getArrayAttrOrNull(builder, bindStrNameDeviceTypes), getArrayAttrOrNull(builder, workerDeviceTypes), getArrayAttrOrNull(builder, vectorDeviceTypes), getArrayAttrOrNull(builder, seqDeviceTypes), hasNohost, @@ -4507,8 +4558,10 @@ static void interpretRoutineDeviceInfo( llvm::SmallVector &seqDeviceTypes, llvm::SmallVector &vectorDeviceTypes, llvm::SmallVector &workerDeviceTypes, - llvm::SmallVector &bindNameDeviceTypes, - llvm::SmallVector &bindNames, + llvm::SmallVector &bindIdNameDeviceTypes, + llvm::SmallVector &bindStrNameDeviceTypes, + llvm::SmallVector &bindIdNames, + llvm::SmallVector &bindStrNames, llvm::SmallVector &gangDeviceTypes, llvm::SmallVector &gangDimValues, llvm::SmallVector &gangDimDeviceTypes) { @@ -4541,16 +4594,18 @@ static void interpretRoutineDeviceInfo( if (dinfo.bindNameOpt().has_value()) { const auto &bindName = dinfo.bindNameOpt().value(); mlir::Attribute bindNameAttr; - if (const auto &bindStr{std::get_if(&bindName)}) { + if (const auto &bindSym{ + std::get_if(&bindName)}) { + bindNameAttr = builder.getSymbolRefAttr(converter.mangleName(*bindSym)); + bindIdNames.push_back(bindNameAttr); + bindIdNameDeviceTypes.push_back(getDeviceTypeAttr()); + } else if (const auto &bindStr{std::get_if(&bindName)}) { bindNameAttr = builder.getStringAttr(*bindStr); - } else if (const auto &bindSym{ - std::get_if(&bindName)}) { - bindNameAttr = builder.getStringAttr(converter.mangleName(*bindSym)); + bindStrNames.push_back(bindNameAttr); + bindStrNameDeviceTypes.push_back(getDeviceTypeAttr()); } else { llvm_unreachable("Unsupported bind name type"); } - bindNames.push_back(bindNameAttr); - bindNameDeviceTypes.push_back(getDeviceTypeAttr()); } } @@ -4566,8 +4621,9 @@ void Fortran::lower::genOpenACCRoutineConstruct( bool hasNohost{false}; llvm::SmallVector seqDeviceTypes, vectorDeviceTypes, - workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimDeviceTypes, gangDimValues; + workerDeviceTypes, bindIdNameDeviceTypes, bindStrNameDeviceTypes, + bindIdNames, bindStrNames, gangDeviceTypes, gangDimDeviceTypes, + gangDimValues; for (const 
Fortran::semantics::OpenACCRoutineInfo &info : routineInfos) { // Device Independent Attributes @@ -4576,24 +4632,26 @@ void Fortran::lower::genOpenACCRoutineConstruct( } // Note: Device Independent Attributes are set to the // none device type in `info`. - interpretRoutineDeviceInfo(converter, info, seqDeviceTypes, - vectorDeviceTypes, workerDeviceTypes, - bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimValues, gangDimDeviceTypes); + interpretRoutineDeviceInfo( + converter, info, seqDeviceTypes, vectorDeviceTypes, workerDeviceTypes, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, bindIdNames, + bindStrNames, gangDeviceTypes, gangDimValues, gangDimDeviceTypes); // Device Dependent Attributes for (const Fortran::semantics::OpenACCRoutineDeviceTypeInfo &dinfo : info.deviceTypeInfos()) { - interpretRoutineDeviceInfo( - converter, dinfo, seqDeviceTypes, vectorDeviceTypes, - workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimValues, gangDimDeviceTypes); + interpretRoutineDeviceInfo(converter, dinfo, seqDeviceTypes, + vectorDeviceTypes, workerDeviceTypes, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, + bindIdNames, bindStrNames, gangDeviceTypes, + gangDimValues, gangDimDeviceTypes); } } createOpenACCRoutineConstruct( - converter, loc, mod, funcOp, funcName, hasNohost, bindNames, - bindNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes, - seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes); + converter, loc, mod, funcOp, funcName, hasNohost, bindIdNames, + bindStrNames, bindIdNameDeviceTypes, bindStrNameDeviceTypes, + gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes, + workerDeviceTypes, vectorDeviceTypes); } static void diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 3fae3f3a0ddfd..675a58e4f35a1 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -26,6 +26,8 @@ #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Semantics/attr.h" #include "flang/Semantics/tools.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallSet.h" namespace Fortran { namespace lower { @@ -49,7 +51,7 @@ DataSharingProcessor::DataSharingProcessor( firOpBuilder(converter.getFirOpBuilder()), clauses(clauses), eval(eval), shouldCollectPreDeterminedSymbols(shouldCollectPreDeterminedSymbols), useDelayedPrivatization(useDelayedPrivatization), symTable(symTable), - visitor() { + visitor(semaCtx) { eval.visit([&](const auto &functionParserNode) { parser::Walk(functionParserNode, visitor); }); @@ -424,24 +426,55 @@ getSource(const semantics::SemanticsContext &semaCtx, return source; } +static void collectPrivatizingConstructs( + llvm::SmallSet &constructs, unsigned version) { + using Clause = llvm::omp::Clause; + using Directive = llvm::omp::Directive; + + static const Clause privatizingClauses[] = { + Clause::OMPC_private, + Clause::OMPC_lastprivate, + Clause::OMPC_firstprivate, + Clause::OMPC_in_reduction, + Clause::OMPC_reduction, + Clause::OMPC_linear, + // TODO: Clause::OMPC_induction, + Clause::OMPC_task_reduction, + Clause::OMPC_detach, + Clause::OMPC_use_device_ptr, + Clause::OMPC_is_device_ptr, + }; + + for (auto dir : llvm::enum_seq_inclusive(Directive::First_, + Directive::Last_)) { + bool allowsPrivatizing = llvm::any_of(privatizingClauses, [&](Clause cls) { + return llvm::omp::isAllowedClauseForDirective(dir, cls, version); + }); + if (allowsPrivatizing) + constructs.insert(dir); + } +} + bool 
DataSharingProcessor::isOpenMPPrivatizingConstruct( - const parser::OpenMPConstruct &omp) { - return common::visit( - [](auto &&s) { - using BareS = llvm::remove_cvref_t; - return std::is_same_v || - std::is_same_v || - std::is_same_v; - }, - omp.u); + const parser::OpenMPConstruct &omp, unsigned version) { + static llvm::SmallSet privatizing; + [[maybe_unused]] static bool init = + (collectPrivatizingConstructs(privatizing, version), true); + + // As of OpenMP 6.0, privatizing constructs (with the test being if they + // allow a privatizing clause) are: dispatch, distribute, do, for, loop, + // parallel, scope, sections, simd, single, target, target_data, task, + // taskgroup, taskloop, and teams. + return llvm::is_contained(privatizing, extractOmpDirective(omp)); } bool DataSharingProcessor::isOpenMPPrivatizingEvaluation( const pft::Evaluation &eval) const { - return eval.visit([](auto &&s) { + unsigned version = semaCtx.langOptions().OpenMPVersion; + return eval.visit([=](auto &&s) { using BareS = llvm::remove_cvref_t; if constexpr (std::is_same_v) { - return isOpenMPPrivatizingConstruct(s); + return isOpenMPPrivatizingConstruct(s, version); } else { return false; } diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index ee2fc70d2e673..bc422f410403a 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -36,6 +36,8 @@ class DataSharingProcessor { /// at any point in time. This is used to track Symbol definition scopes in /// order to tell which OMP scope defined vs. references a certain Symbol. struct OMPConstructSymbolVisitor { + OMPConstructSymbolVisitor(semantics::SemanticsContext &ctx) + : version(ctx.langOptions().OpenMPVersion) {} template bool Pre(const T &) { return true; @@ -45,13 +47,13 @@ class DataSharingProcessor { bool Pre(const parser::OpenMPConstruct &omp) { // Skip constructs that may not have privatizations. - if (isOpenMPPrivatizingConstruct(omp)) + if (isOpenMPPrivatizingConstruct(omp, version)) constructs.push_back(&omp); return true; } void Post(const parser::OpenMPConstruct &omp) { - if (isOpenMPPrivatizingConstruct(omp)) + if (isOpenMPPrivatizingConstruct(omp, version)) constructs.pop_back(); } @@ -68,6 +70,9 @@ class DataSharingProcessor { /// construct that defines symbol. bool isSymbolDefineBy(const semantics::Symbol *symbol, lower::pft::Evaluation &eval) const; + + private: + unsigned version; }; mlir::OpBuilder::InsertPoint lastPrivIP; @@ -115,7 +120,8 @@ class DataSharingProcessor { mlir::OpBuilder::InsertPoint *lastPrivIP); void insertDeallocs(); - static bool isOpenMPPrivatizingConstruct(const parser::OpenMPConstruct &omp); + static bool isOpenMPPrivatizingConstruct(const parser::OpenMPConstruct &omp, + unsigned version); bool isOpenMPPrivatizingEvaluation(const pft::Evaluation &eval) const; public: diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 4458f62eea95a..fcb20fdf187ff 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -372,90 +372,6 @@ extractMappedBaseValues(llvm::ArrayRef vars, }); } -/// Get the directive enumeration value corresponding to the given OpenMP -/// construct PFT node. 
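// Note on the one-time initialization used by isOpenMPPrivatizingConstruct
// above: a function-local static is initialized exactly once (and
// thread-safely), so the `version` passed by the *first* caller is the one
// that populates the set; later calls with a different OpenMP version reuse
// the same contents. Minimal illustration of the comma-operator idiom:
#include <set>

static bool isSmall(int n) {
  static std::set<int> small;
  [[maybe_unused]] static bool init = (small.insert({0, 1, 2}), true);
  return small.count(n) != 0;
}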
-llvm::omp::Directive -extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) { - return common::visit( - common::visitors{ - [](const parser::OpenMPAllocatorsConstruct &c) { - return llvm::omp::OMPD_allocators; - }, - [](const parser::OpenMPAssumeConstruct &c) { - return llvm::omp::OMPD_assume; - }, - [](const parser::OpenMPAtomicConstruct &c) { - return llvm::omp::OMPD_atomic; - }, - [](const parser::OpenMPBlockConstruct &c) { - return std::get( - std::get(c.t).t) - .v; - }, - [](const parser::OpenMPCriticalConstruct &c) { - return llvm::omp::OMPD_critical; - }, - [](const parser::OpenMPDeclarativeAllocate &c) { - return llvm::omp::OMPD_allocate; - }, - [](const parser::OpenMPDispatchConstruct &c) { - return llvm::omp::OMPD_dispatch; - }, - [](const parser::OpenMPExecutableAllocate &c) { - return llvm::omp::OMPD_allocate; - }, - [](const parser::OpenMPLoopConstruct &c) { - return std::get( - std::get(c.t).t) - .v; - }, - [](const parser::OpenMPSectionConstruct &c) { - return llvm::omp::OMPD_section; - }, - [](const parser::OpenMPSectionsConstruct &c) { - return std::get( - std::get(c.t).t) - .v; - }, - [](const parser::OpenMPStandaloneConstruct &c) { - return common::visit( - common::visitors{ - [](const parser::OpenMPSimpleStandaloneConstruct &c) { - return c.v.DirId(); - }, - [](const parser::OpenMPFlushConstruct &c) { - return llvm::omp::OMPD_flush; - }, - [](const parser::OpenMPCancelConstruct &c) { - return llvm::omp::OMPD_cancel; - }, - [](const parser::OpenMPCancellationPointConstruct &c) { - return llvm::omp::OMPD_cancellation_point; - }, - [](const parser::OmpMetadirectiveDirective &c) { - return llvm::omp::OMPD_metadirective; - }, - [](const parser::OpenMPDepobjConstruct &c) { - return llvm::omp::OMPD_depobj; - }, - [](const parser::OpenMPInteropConstruct &c) { - return llvm::omp::OMPD_interop; - }}, - c.u); - }, - [](const parser::OpenMPUtilityConstruct &c) { - return common::visit( - common::visitors{[](const parser::OmpErrorDirective &c) { - return llvm::omp::OMPD_error; - }, - [](const parser::OmpNothingDirective &c) { - return llvm::omp::OMPD_nothing; - }}, - c.u); - }}, - ompConstruct.u); -} - /// Populate the global \see hostEvalInfo after processing clauses for the given /// \p eval OpenMP target construct, or nested constructs, if these must be /// evaluated outside of the target region per the spec. diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 2e53f01f1da6a..b194150c0f7f0 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -661,6 +661,90 @@ bool collectLoopRelatedInfo( return found; } + +/// Get the directive enumeration value corresponding to the given OpenMP +/// construct PFT node. 
+llvm::omp::Directive +extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) { + return common::visit( + common::visitors{ + [](const parser::OpenMPAllocatorsConstruct &c) { + return llvm::omp::OMPD_allocators; + }, + [](const parser::OpenMPAssumeConstruct &c) { + return llvm::omp::OMPD_assume; + }, + [](const parser::OpenMPAtomicConstruct &c) { + return llvm::omp::OMPD_atomic; + }, + [](const parser::OpenMPBlockConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPCriticalConstruct &c) { + return llvm::omp::OMPD_critical; + }, + [](const parser::OpenMPDeclarativeAllocate &c) { + return llvm::omp::OMPD_allocate; + }, + [](const parser::OpenMPDispatchConstruct &c) { + return llvm::omp::OMPD_dispatch; + }, + [](const parser::OpenMPExecutableAllocate &c) { + return llvm::omp::OMPD_allocate; + }, + [](const parser::OpenMPLoopConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPSectionConstruct &c) { + return llvm::omp::OMPD_section; + }, + [](const parser::OpenMPSectionsConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPStandaloneConstruct &c) { + return common::visit( + common::visitors{ + [](const parser::OpenMPSimpleStandaloneConstruct &c) { + return c.v.DirId(); + }, + [](const parser::OpenMPFlushConstruct &c) { + return llvm::omp::OMPD_flush; + }, + [](const parser::OpenMPCancelConstruct &c) { + return llvm::omp::OMPD_cancel; + }, + [](const parser::OpenMPCancellationPointConstruct &c) { + return llvm::omp::OMPD_cancellation_point; + }, + [](const parser::OmpMetadirectiveDirective &c) { + return llvm::omp::OMPD_metadirective; + }, + [](const parser::OpenMPDepobjConstruct &c) { + return llvm::omp::OMPD_depobj; + }, + [](const parser::OpenMPInteropConstruct &c) { + return llvm::omp::OMPD_interop; + }}, + c.u); + }, + [](const parser::OpenMPUtilityConstruct &c) { + return common::visit( + common::visitors{[](const parser::OmpErrorDirective &c) { + return llvm::omp::OMPD_error; + }, + [](const parser::OmpNothingDirective &c) { + return llvm::omp::OMPD_nothing; + }}, + c.u); + }}, + ompConstruct.u); +} } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index 1526bd4e90233..8e3ad5c3452e2 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -166,6 +166,9 @@ bool collectLoopRelatedInfo( lower::pft::Evaluation &eval, const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv); + +llvm::omp::Directive +extractOmpDirective(const parser::OpenMPConstruct &ompConstruct); } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index e59a6bf2bf224..fb6f0dbf719fb 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -1403,7 +1403,7 @@ hlfir::Entity hlfir::createStackTempFromMold(mlir::Location loc, builder.createTemporary(loc, sequenceType, tmpName, extents, lenParams); } else { alloc = builder.createTemporary(loc, mold.getFortranElementType(), tmpName, - /*shape=*/std::nullopt, lenParams); + /*shape=*/{}, lenParams); } auto declareOp = builder.create(loc, alloc, tmpName, shape, lenParams, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index d32c1fde59f27..d77a656158a37 100644 --- 
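// Illustrative sketch, not part of this patch: extractOmpDirective above
// dispatches with common::visit over a braced set of lambdas
// (common::visitors). The standard-library equivalent is the classic
// "overloaded" helper:
#include <string>
#include <variant>

template <class... Ts> struct overloaded : Ts... {
  using Ts::operator()...;
};
template <class... Ts> overloaded(Ts...) -> overloaded<Ts...>;

static std::string kindOf(const std::variant<int, std::string> &v) {
  return std::visit(
      overloaded{[](int) { return std::string("int"); },
                 [](const std::string &) { return std::string("string"); }},
      v);
}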
a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -385,13 +385,18 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, - {"clock64", &I::genClock64, {}, /*isElemental=*/false}, + {"clock", &I::genNVVMTime, {}, /*isElemental=*/false}, + {"clock64", + &I::genNVVMTime, + {}, + /*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, {"command_argument_count", &I::genCommandArgumentCount}, {"conjg", &I::genConjg}, {"cosd", &I::genCosd}, + {"cospi", &I::genCospi}, {"count", &I::genCount, {{{"mask", asAddr}, {"dim", asValue}, {"kind", asValue}}}, @@ -503,6 +508,10 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, + {"globaltimer", + &I::genNVVMTime, + {}, + /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -1231,8 +1240,11 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, llvm::StringRef mathLibFuncName = mathOp.runtimeFunc; if (!mathLibFuncName.empty()) { // If we enabled MLIR complex or can use approximate operations, we should - // NOT use libm. - if (!forceMlirComplex && !canUseApprox) { + // NOT use libm. Avoid libm when targeting AMDGPU as those symbols are not + // available on the device and we rely on MLIR complex operations to + // later map to OCML calls. + bool isAMDGPU = fir::getTargetTriple(builder.getModule()).isAMDGCN(); + if (!forceMlirComplex && !canUseApprox && !isAMDGPU) { result = genLibCall(builder, loc, mathOp, mathLibFuncType, args); LLVM_DEBUG(result.dump(); llvm::dbgs() << "\n"); return result; @@ -3557,16 +3569,6 @@ IntrinsicLibrary::genChdir(std::optional resultType, return {}; } -// CLOCK64 -mlir::Value IntrinsicLibrary::genClock64(mlir::Type resultType, - llvm::ArrayRef args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.read.ptx.sreg.clock64"; - mlir::MLIRContext *context = builder.getContext(); - mlir::FunctionType ftype = mlir::FunctionType::get(context, {}, {resultType}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - return builder.create(loc, funcOp, args).getResult(0); -} - // CMPLX mlir::Value IntrinsicLibrary::genCmplx(mlir::Type resultType, llvm::ArrayRef args) { @@ -3622,6 +3624,21 @@ mlir::Value IntrinsicLibrary::genCosd(mlir::Type resultType, return getRuntimeCallGenerator("cos", ftype)(builder, loc, {arg}); } +// COSPI +mlir::Value IntrinsicLibrary::genCospi(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 1); + mlir::MLIRContext *context = builder.getContext(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); + llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); + mlir::Value dfactor = + builder.createRealConstant(loc, mlir::Float64Type::get(context), pi); + mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); + mlir::Value arg = builder.create(loc, args[0], factor); + return getRuntimeCallGenerator("cos", ftype)(builder, loc, {arg}); +} + // COUNT fir::ExtendedValue IntrinsicLibrary::genCount(mlir::Type resultType, @@ -7196,6 +7213,14 @@ IntrinsicLibrary::genNull(mlir::Type, llvm::ArrayRef args) { return fir::MutableBoxValue(boxStorage, mold->nonDeferredLenParams(), {}); } +// CLOCK, CLOCK64, GLOBALTIMER +template +mlir::Value 
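// Illustrative sketch, not part of this patch: genCospi above lowers
// COSPI(x) as cos(pi * x), materializing pi as a double constant and
// converting it to the argument's kind. A host-side reference analogue; a
// dedicated cospi implementation could reduce the argument before scaling
// for better accuracy at large |x|.
#include <cmath>

static double cospiRef(double x) {
  constexpr double kPi = 3.14159265358979323846;
  return std::cos(kPi * x);
}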
IntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 0 && "expect no arguments"); + return builder.create(loc, resultType).getResult(); +} + // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, diff --git a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp index 64d70d70829fb..3fb7fab099965 100644 --- a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp +++ b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp @@ -31,8 +31,7 @@ mlir::func::FuncOp fir::factory::getRealloc(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmGetRounding(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), std::nullopt, {int32Ty}); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {}, {int32Ty}); return builder.createFunction(builder.getUnknownLoc(), "llvm.get.rounding", funcTy); } @@ -40,8 +39,7 @@ fir::factory::getLlvmGetRounding(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmSetRounding(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), {int32Ty}, std::nullopt); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {int32Ty}, {}); return builder.createFunction(builder.getUnknownLoc(), "llvm.set.rounding", funcTy); } @@ -49,8 +47,8 @@ fir::factory::getLlvmSetRounding(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmInitTrampoline(fir::FirOpBuilder &builder) { auto ptrTy = builder.getRefType(builder.getIntegerType(8)); - auto funcTy = mlir::FunctionType::get(builder.getContext(), - {ptrTy, ptrTy, ptrTy}, std::nullopt); + auto funcTy = + mlir::FunctionType::get(builder.getContext(), {ptrTy, ptrTy, ptrTy}, {}); return builder.createFunction(builder.getUnknownLoc(), "llvm.init.trampoline", funcTy); } @@ -90,8 +88,7 @@ mlir::func::FuncOp fir::factory::getFeenableexcept(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getFegetexcept(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), std::nullopt, {int32Ty}); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {}, {int32Ty}); return builder.createFunction(builder.getUnknownLoc(), "fegetexcept", funcTy); } diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp index d944a4c98473e..93abedc43936d 100644 --- a/flang/lib/Optimizer/Builder/MutableBox.cpp +++ b/flang/lib/Optimizer/Builder/MutableBox.cpp @@ -521,23 +521,23 @@ void fir::factory::associateMutableBox(fir::FirOpBuilder &builder, mlir::Value sourceBox; if (auto *polyBox = source.getBoxOf()) sourceBox = polyBox->getSourceBox(); - writer.updateMutableBox(p.getAddr(), /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, - /*lengths=*/std::nullopt, sourceBox); + writer.updateMutableBox(p.getAddr(), /*lbounds=*/{}, + /*extents=*/{}, + /*lengths=*/{}, sourceBox); }, [&](const fir::UnboxedValue &addr) { - writer.updateMutableBox(addr, /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, - /*lengths=*/std::nullopt); + writer.updateMutableBox(addr, /*lbounds=*/{}, + /*extents=*/{}, + /*lengths=*/{}); }, [&](const fir::CharBoxValue &ch) { - writer.updateMutableBox(ch.getAddr(), /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, {ch.getLen()}); + writer.updateMutableBox(ch.getAddr(), 
/*lbounds=*/{}, + /*extents=*/{}, {ch.getLen()}); }, [&](const fir::ArrayBoxValue &arr) { writer.updateMutableBox(arr.getAddr(), lbounds.empty() ? arr.getLBounds() : lbounds, - arr.getExtents(), /*lengths=*/std::nullopt); + arr.getExtents(), /*lengths=*/{}); }, [&](const fir::CharArrayBoxValue &arr) { writer.updateMutableBox(arr.getAddr(), @@ -634,11 +634,11 @@ void fir::factory::associateMutableBoxWithRemap( source.match( [&](const fir::PolymorphicValue &p) { writer.updateMutableBox(cast(p.getAddr()), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::UnboxedValue &addr) { writer.updateMutableBox(cast(addr), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::CharBoxValue &ch) { writer.updateMutableBox(cast(ch.getAddr()), lbounds, extents, @@ -646,7 +646,7 @@ void fir::factory::associateMutableBoxWithRemap( }, [&](const fir::ArrayBoxValue &arr) { writer.updateMutableBox(cast(arr.getAddr()), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::CharArrayBoxValue &arr) { writer.updateMutableBox(cast(arr.getAddr()), lbounds, extents, @@ -755,8 +755,8 @@ static mlir::Value allocateAndInitNewStorage(fir::FirOpBuilder &builder, // there is no way to know here if a derived type needs it or not. But the // information is available at compile time and could be reflected here // somehow. - mlir::Value irBox = createNewFirBox(builder, loc, box, newStorage, - std::nullopt, extents, lengths); + mlir::Value irBox = + createNewFirBox(builder, loc, box, newStorage, {}, extents, lengths); fir::runtime::genDerivedTypeInitialize(builder, loc, irBox); } return newStorage; diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp index a943469a76728..62a0652cc2e5d 100644 --- a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp @@ -47,3 +47,18 @@ void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder, builder, loc, fTy, desc, sourceFile, sourceLine)}; builder.create(loc, func, args); } + +void fir::runtime::cuda::genSetAllocatorIndex(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value desc, + mlir::Value index) { + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc(loc, builder); + auto fTy = func.getFunctionType(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, desc, index, sourceFile, sourceLine)}; + builder.create(loc, func, args); +} diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt index 980307db315d9..16c7944a885a1 100644 --- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt +++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt @@ -34,6 +34,7 @@ add_flang_library(FIRCodeGen MLIR_LIBS MLIRComplexToLLVM + MLIRComplexToROCDLLibraryCalls MLIRComplexToStandard MLIRGPUDialect MLIRMathToFuncs diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index ecc04a6c9a2be..d879382555c39 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -33,6 +33,7 @@ #include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h" 
+#include "mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h" #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" @@ -1122,6 +1123,16 @@ struct AllocMemOpConversion : public fir::FIROpConversion { for (mlir::Value opnd : adaptor.getOperands()) size = rewriter.create( loc, ity, size, integerCast(loc, rewriter, ity, opnd)); + + // As the return value of malloc(0) is implementation defined, allocate one + // byte to ensure the allocation status being true. This behavior aligns to + // what the runtime has. + mlir::Value zero = genConstantIndex(loc, ity, rewriter, 0); + mlir::Value one = genConstantIndex(loc, ity, rewriter, 1); + mlir::Value cmp = rewriter.create( + loc, mlir::LLVM::ICmpPredicate::sgt, size, zero); + size = rewriter.create(loc, cmp, size, one); + auto mallocTyWidth = lowerTy().getIndexTypeBitwidth(); auto mallocTy = mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth); @@ -4145,22 +4156,24 @@ class FIRToLLVMLowering // conversions that affect the ModuleOp, e.g. create new // function operations in it. We have to run such conversions // as passes here. - mlir::OpPassManager mathConvertionPM("builtin.module"); + mlir::OpPassManager mathConversionPM("builtin.module"); bool isAMDGCN = fir::getTargetTriple(mod).isAMDGCN(); // If compiling for AMD target some math operations must be lowered to AMD // GPU library calls, the rest can be converted to LLVM intrinsics, which // is handled in the mathToLLVM conversion. The lowering to libm calls is // not needed since all math operations are handled this way. - if (isAMDGCN) - mathConvertionPM.addPass(mlir::createConvertMathToROCDL()); + if (isAMDGCN) { + mathConversionPM.addPass(mlir::createConvertMathToROCDL()); + mathConversionPM.addPass(mlir::createConvertComplexToROCDLLibraryCalls()); + } // Convert math::FPowI operations to inline implementation // only if the exponent's width is greater than 32, otherwise, // it will be lowered to LLVM intrinsic operation by a later conversion. mlir::ConvertMathToFuncsOptions mathToFuncsOptions{}; mathToFuncsOptions.minWidthOfFPowIExponent = 33; - mathConvertionPM.addPass( + mathConversionPM.addPass( mlir::createConvertMathToFuncs(mathToFuncsOptions)); mlir::ConvertComplexToStandardPassOptions complexToStandardOptions{}; @@ -4173,15 +4186,15 @@ class FIRToLLVMLowering complexToStandardOptions.complexRange = mlir::complex::ComplexRangeFlags::improved; } - mathConvertionPM.addPass( + mathConversionPM.addPass( mlir::createConvertComplexToStandardPass(complexToStandardOptions)); // Convert Math dialect operations into LLVM dialect operations. // There is no way to prefer MathToLLVM patterns over MathToLibm // patterns (applied below), so we have to run MathToLLVM conversion here. 
-    mathConvertionPM.addNestedPass<mlir::func::FuncOp>(
+    mathConversionPM.addNestedPass<mlir::func::FuncOp>(
         mlir::createConvertMathToLLVMPass());
-    if (mlir::failed(runPipeline(mathConvertionPM, mod)))
+    if (mlir::failed(runPipeline(mathConversionPM, mod)))
       return signalPassFailure();
 
     std::optional<mlir::DataLayout> dl =
diff --git a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
index e34771c67b0c3..d2cf85bedd54c 100644
--- a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
+++ b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
@@ -63,13 +63,14 @@ class PackArrayConversion : public mlir::OpRewritePattern<fir::PackArrayOp> {
   static constexpr llvm::StringRef bufferName = ".repacked";
 
   // Return value of fir::BaseBoxType that represents a temporary
-  // array created for the original box with given extents and
-  // type parameters. The new box has the default lower bounds.
-  // If useStack is true, then the temporary will be allocated
+  // array created for the original box with given lbounds/extents and
+  // type parameters. The new box has the same shape as the original
+  // array. If useStack is true, then the temporary will be allocated
   // in stack memory (when possible).
   static mlir::Value allocateTempBuffer(fir::FirOpBuilder &builder,
                                         mlir::Location loc, bool useStack,
                                         mlir::Value origBox,
+                                        llvm::ArrayRef<mlir::Value> lbounds,
                                         llvm::ArrayRef<mlir::Value> extents,
                                         llvm::ArrayRef<mlir::Value> typeParams);
 
@@ -99,7 +100,9 @@ class UnpackArrayConversion
 // the presence of the stack attribute does not automatically
 // mean that the allocation is actually done in stack memory.
 // For example, we always do the heap allocation for polymorphic
-// types using Fortran runtime.
+// types using Fortran runtime. Currently, we allocate all
+// repack temporaries of derived types as polymorphic,
+// so that we can preserve the dynamic type of the original.
 // Adding the polymorphic mold to fir.alloca and then using
 // Fortran runtime to compute the allocation size could probably
 // resolve this limitation.
@@ -170,7 +173,8 @@ PackArrayConversion::matchAndRewrite(fir::PackArrayOp op,
 
 mlir::Value PackArrayConversion::allocateTempBuffer(
     fir::FirOpBuilder &builder, mlir::Location loc, bool useStack,
-    mlir::Value origBox, llvm::ArrayRef<mlir::Value> extents,
+    mlir::Value origBox, llvm::ArrayRef<mlir::Value> lbounds,
+    llvm::ArrayRef<mlir::Value> extents,
     llvm::ArrayRef<mlir::Value> typeParams) {
   auto tempType = mlir::cast<fir::SequenceType>(
       fir::extractSequenceType(origBox.getType()));
@@ -191,16 +195,35 @@ mlir::Value PackArrayConversion::allocateTempBuffer(
   assert(!isHeapAllocation && "temp must have been allocated on the stack");
 
   mlir::Type ptrType = base.getType();
-  if (llvm::isa<fir::BaseBoxType>(ptrType))
-    return base;
+  if (auto tempBoxType = mlir::dyn_cast<fir::BaseBoxType>(ptrType)) {
+    // We need to reset the CFI_attribute_allocatable before
+    // returning the temporary box to avoid any mishandling
+    // of the temporary box in Fortran runtime.
+    base = builder.create<fir::BoxAddrOp>(loc, fir::boxMemRefType(tempBoxType),
+                                          base);
+    ptrType = base.getType();
+  }
 
-  mlir::Type tempBoxType = fir::BoxType::get(mlir::isa<fir::HeapType>(ptrType)
-                                                 ? ptrType
-                                                 : fir::unwrapRefType(ptrType));
+  // Create the temporary using the dynamic type of the original,
+  // if it is polymorphic, or it has a derived type with SEQUENCE
+  // or BIND attribute (such dummy arguments may have their dynamic
+  // type not exactly matching their static type).
+  // Note that for the latter case, the allocation can still be done
+  // without the mold, because the dynamic and static types
+  // must be storage compatible.
+ bool useDynamicType = fir::isBoxedRecordType(origBox.getType()) || + fir::isPolymorphicType(origBox.getType()); + mlir::Type tempBoxType = + fir::wrapInClassOrBoxType(fir::unwrapRefType(ptrType), + /*isPolymorphic=*/useDynamicType); + // Use the shape with proper lower bounds for the final box. + shape = builder.genShape(loc, lbounds, extents); mlir::Value newBox = builder.createBox(loc, tempBoxType, base, shape, /*slice=*/nullptr, - typeParams, /*tdesc=*/nullptr); - return newBox; + typeParams, useDynamicType ? origBox : nullptr); + // The new box might be !fir.class, while the original might be + // !fir.box - we have to add a conversion. + return builder.createConvert(loc, origBox.getType(), newBox); } mlir::FailureOr @@ -280,16 +303,11 @@ PackArrayConversion::genRepackedBox(fir::FirOpBuilder &builder, << op.getOperation() << '\n'; } - mlir::Value tempBox = - allocateTempBuffer(builder, loc, op.getStack(), box, extents, typeParams); + mlir::Value tempBox = allocateTempBuffer(builder, loc, op.getStack(), box, + lbounds, extents, typeParams); if (!op.getNoCopy()) fir::runtime::genShallowCopy(builder, loc, tempBox, box, /*resultIsAllocated=*/true); - - // Set lower bounds after the original box. - mlir::Value shift = builder.genShift(loc, lbounds); - tempBox = builder.create(loc, boxType, tempBox, shift, - /*slice=*/nullptr); builder.create(loc, tempBox); return ifOp.getResult(0); diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 7dbf21ce0c125..b60a72e4340b9 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -1443,14 +1443,35 @@ struct TargetAMDGPU : public GenericTarget { CodeGenSpecifics::Marshalling complexArgumentType(mlir::Location loc, mlir::Type eleTy) const override { CodeGenSpecifics::Marshalling marshal; - TODO(loc, "handle complex argument types"); + const auto *sem = &floatToSemantics(kindMap, eleTy); + if (sem == &llvm::APFloat::IEEEsingle()) { + // Lower COMPLEX(KIND=4) as an array of two element values. + marshal.emplace_back(fir::SequenceType::get({2}, eleTy), AT{}); + } else if (sem == &llvm::APFloat::IEEEdouble()) { + // Pass COMPLEX(KIND=8) as two separate arguments. + marshal.emplace_back(eleTy, AT{}); + marshal.emplace_back(eleTy, AT{}); + } else { + typeTodo(sem, loc, "argument"); + } return marshal; } CodeGenSpecifics::Marshalling complexReturnType(mlir::Location loc, mlir::Type eleTy) const override { CodeGenSpecifics::Marshalling marshal; - TODO(loc, "handle complex return types"); + const auto *sem = &floatToSemantics(kindMap, eleTy); + if (sem == &llvm::APFloat::IEEEsingle()) { + // Return COMPLEX(KIND=4) as an array of two elements. + marshal.emplace_back(fir::SequenceType::get({2}, eleTy), AT{}); + } else if (sem == &llvm::APFloat::IEEEdouble()) { + // Return COMPLEX(KIND=8) via an aggregate with two fields. 
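// Illustrative sketch, not part of this patch: the AMDGPU complex ABI chosen
// above, as seen from the C++ side. COMPLEX(KIND=4) travels as a two-element
// float array, while COMPLEX(KIND=8) is passed as two scalar doubles and
// returned as a two-field aggregate. Types below are illustrative only.
struct DoubleComplexRet {
  double re, im;
};

// COMPLEX(KIND=8) argument passing: two scalar doubles in, aggregate out.
static DoubleComplexRet conjg(double re, double im) { return {re, -im}; }

// COMPLEX(KIND=4) argument/return slot: float c[2];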
+ marshal.emplace_back(mlir::TupleType::get(eleTy.getContext(), + mlir::TypeRange{eleTy, eleTy}), + AT{}); + } else { + typeTodo(sem, loc, "return"); + } return marshal; } }; diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp index 687007d957225..ade80716f2561 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp @@ -345,6 +345,17 @@ llvm::LogicalResult cuf::StreamCastOp::verify() { return checkStreamType(*this); } +//===----------------------------------------------------------------------===// +// SetAllocatorOp +//===----------------------------------------------------------------------===// + +llvm::LogicalResult cuf::SetAllocatorIndexOp::verify() { + if (!mlir::isa(fir::unwrapRefType(getBox().getType()))) + return emitOpError( + "expect box to be a reference to class or box type value"); + return mlir::success(); +} + // Tablegen operators #define GET_OP_CLASSES diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index b6bf2753b80ce..cf20d84cbbcdb 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -4448,7 +4448,7 @@ llvm::LogicalResult fir::UnboxProcOp::verify() { void fir::IfOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value cond, bool withElseRegion) { - build(builder, result, std::nullopt, cond, withElseRegion); + build(builder, result, {}, cond, withElseRegion); } void fir::IfOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp index 58f2b57712974..00ca6731c035b 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp @@ -296,7 +296,7 @@ struct SetLengthOpConversion llvm::StringRef tmpName{".tmp"}; llvm::SmallVector lenParams{adaptor.getLength()}; auto alloca = builder.createTemporary(loc, charType, tmpName, - /*shape=*/std::nullopt, lenParams); + /*shape=*/{}, lenParams); auto declareOp = builder.create( loc, alloca, tmpName, /*shape=*/mlir::Value{}, lenParams, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index bfb0daeacb8c3..35badb6eadb1c 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -53,21 +53,26 @@ static void processAddrOfOp(fir::AddrOfOp addrOfOp, } } +static void processTypeDescriptor(fir::RecordType recTy, + mlir::SymbolTable &symbolTable, + llvm::DenseSet &candidates) { + if (auto globalOp = symbolTable.lookup( + fir::NameUniquer::getTypeDescriptorName(recTy.getName()))) { + if (!candidates.contains(globalOp)) { + globalOp.walk([&](fir::AddrOfOp op) { + processAddrOfOp(op, symbolTable, candidates, + /*recurseInGlobal=*/true); + }); + candidates.insert(globalOp); + } + } +} + static void processEmboxOp(fir::EmboxOp emboxOp, mlir::SymbolTable &symbolTable, llvm::DenseSet &candidates) { if (auto recTy = mlir::dyn_cast( - fir::unwrapRefType(emboxOp.getMemref().getType()))) { - if (auto globalOp = symbolTable.lookup( - fir::NameUniquer::getTypeDescriptorName(recTy.getName()))) { - if (!candidates.contains(globalOp)) { - globalOp.walk([&](fir::AddrOfOp op) { - processAddrOfOp(op, symbolTable, candidates, - /*recurseInGlobal=*/true); - }); - 
candidates.insert(globalOp); - } - } - } + fir::unwrapRefType(emboxOp.getMemref().getType()))) + processTypeDescriptor(recTy, symbolTable, candidates); } static void @@ -85,6 +90,17 @@ prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp, } } +static void +processPotentialTypeDescriptor(mlir::Type candidateType, + mlir::SymbolTable &symbolTable, + llvm::DenseSet &candidates) { + if (auto boxTy = mlir::dyn_cast(candidateType)) + candidateType = boxTy.getEleTy(); + candidateType = fir::unwrapSequenceType(fir::unwrapRefType(candidateType)); + if (auto recTy = mlir::dyn_cast(candidateType)) + processTypeDescriptor(recTy, symbolTable, candidates); +} + class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase { public: void runOnOperation() override { @@ -115,6 +131,8 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase { for (auto globalOp : mod.getOps()) { if (cuf::isRegisteredDeviceGlobal(globalOp)) { candidates.insert(globalOp); + processPotentialTypeDescriptor(globalOp.getType(), parentSymTable, + candidates); } else if (globalOp.getConstant() && mlir::isa( fir::unwrapRefType(globalOp.resultType()))) { diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 0fff06033b73d..750569c126642 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -22,6 +22,7 @@ #include "flang/Runtime/CUDA/memory.h" #include "flang/Runtime/CUDA/pointer.h" #include "flang/Runtime/allocatable.h" +#include "flang/Runtime/allocator-registry-consts.h" #include "flang/Support/Fortran.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/DLTI/DLTI.h" @@ -923,6 +924,34 @@ struct CUFSyncDescriptorOpConversion } }; +struct CUFSetAllocatorIndexOpConversion + : public mlir::OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult + matchAndRewrite(cuf::SetAllocatorIndexOp op, + mlir::PatternRewriter &rewriter) const override { + auto mod = op->getParentOfType(); + fir::FirOpBuilder builder(rewriter, mod); + mlir::Location loc = op.getLoc(); + int idx = kDefaultAllocator; + if (op.getDataAttr() == cuf::DataAttribute::Device) { + idx = kDeviceAllocatorPos; + } else if (op.getDataAttr() == cuf::DataAttribute::Managed) { + idx = kManagedAllocatorPos; + } else if (op.getDataAttr() == cuf::DataAttribute::Unified) { + idx = kUnifiedAllocatorPos; + } else if (op.getDataAttr() == cuf::DataAttribute::Pinned) { + idx = kPinnedAllocatorPos; + } + mlir::Value index = + builder.createIntegerConstant(loc, builder.getI32Type(), idx); + fir::runtime::cuda::genSetAllocatorIndex(builder, loc, op.getBox(), index); + op.erase(); + return mlir::success(); + } +}; + class CUFOpConversion : public fir::impl::CUFOpConversionBase { public: void runOnOperation() override { @@ -984,8 +1013,8 @@ void cuf::populateCUFToFIRConversionPatterns( const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { patterns.insert(patterns.getContext(), &dl, &converter); patterns.insert( - patterns.getContext()); + CUFFreeOpConversion, CUFSyncDescriptorOpConversion, + CUFSetAllocatorIndexOpConversion>(patterns.getContext()); patterns.insert(patterns.getContext(), symtab, &dl, &converter); patterns.insert( diff --git a/flang/lib/Parser/message.cpp b/flang/lib/Parser/message.cpp index 909fba948a45a..2a8101dd0b810 100644 --- a/flang/lib/Parser/message.cpp +++ b/flang/lib/Parser/message.cpp @@ -453,7 +453,7 @@ void Messages::ResolveProvenances(const 
AllCookedSources &allCooked) { void Messages::Emit(llvm::raw_ostream &o, const AllCookedSources &allCooked, bool echoSourceLines, const common::LanguageFeatureControl *hintFlagPtr, - std::size_t maxErrorsToEmit) const { + std::size_t maxErrorsToEmit, bool warningsAreErrors) const { std::vector sorted; for (const auto &msg : messages_) { sorted.push_back(&msg); @@ -469,7 +469,7 @@ void Messages::Emit(llvm::raw_ostream &o, const AllCookedSources &allCooked, } msg->Emit(o, allCooked, echoSourceLines, hintFlagPtr); lastMsg = msg; - if (msg->IsFatal()) { + if (warningsAreErrors || msg->IsFatal()) { ++errorsEmitted; } // If maxErrorsToEmit is 0, emit all errors, otherwise break after @@ -491,7 +491,18 @@ void Messages::AttachTo(Message &msg, std::optional severity) { messages_.clear(); } -bool Messages::AnyFatalError() const { +bool Messages::AnyFatalError(bool warningsAreErrors) const { + // Short-circuit in the most common case. + if (messages_.empty()) { + return false; + } + // If warnings are errors and there are warnings or errors, this is fatal. + // This preserves the compiler's current behavior of treating any non-fatal + // message as a warning. We may want to refine this in the future. + if (warningsAreErrors) { + return true; + } + // Otherwise, check the message buffer for fatal errors. for (const auto &msg : messages_) { if (msg.IsFatal()) { return true; diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index d70aaab82cbab..76c9499410017 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -912,7 +912,7 @@ TYPE_PARSER(construct( scalarIntExpr)) TYPE_PARSER( - construct(designator) || construct("/" >> name / "/")) + construct(designator) || "/" >> construct(name) / "/") // OMP 5.0 2.19.4.5 LASTPRIVATE ([lastprivate-modifier :] list) TYPE_PARSER(construct( @@ -1757,6 +1757,13 @@ TYPE_PARSER(construct( Parser{} / endOmpLine, Parser{}, Parser{} / endOmpLine)) +static bool IsExecutionPart(const OmpDirectiveName &name) { + return name.IsExecutionPart(); +} + +TYPE_PARSER(construct( + startOmpLine >> predicated(Parser{}, IsExecutionPart))) + TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, startOmpLine >> withMessage("expected OpenMP construct"_err_en_US, diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp index 824612e49293f..7b46c9f469b06 100644 --- a/flang/lib/Parser/parse-tree.cpp +++ b/flang/lib/Parser/parse-tree.cpp @@ -368,6 +368,53 @@ llvm::omp::Clause OmpClause::Id() const { return std::visit([](auto &&s) { return getClauseIdForClass(s); }, u); } +bool OmpDirectiveName::IsExecutionPart() const { + // Can the directive appear in the execution part of the program. 
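// Illustrative sketch, not part of this patch: with the new
// warningsAreErrors parameter, AnyFatalError above treats any buffered
// message as fatal under -Werror-style handling. The same decision in
// miniature:
#include <vector>

struct Msg {
  bool fatal;
};

static bool anyFatal(const std::vector<Msg> &msgs, bool warningsAreErrors) {
  if (msgs.empty())
    return false;
  if (warningsAreErrors) // every diagnostic counts as an error
    return true;
  for (const auto &m : msgs)
    if (m.fatal)
      return true;
  return false;
}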
+ llvm::omp::Directive id{v}; + switch (llvm::omp::getDirectiveCategory(id)) { + case llvm::omp::Category::Executable: + return true; + case llvm::omp::Category::Declarative: + switch (id) { + case llvm::omp::Directive::OMPD_allocate: + return true; + default: + return false; + } + break; + case llvm::omp::Category::Informational: + switch (id) { + case llvm::omp::Directive::OMPD_assume: + return true; + default: + return false; + } + break; + case llvm::omp::Category::Meta: + return true; + case llvm::omp::Category::Subsidiary: + switch (id) { + // TODO: case llvm::omp::Directive::OMPD_task_iteration: + case llvm::omp::Directive::OMPD_section: + case llvm::omp::Directive::OMPD_scan: + return true; + default: + return false; + } + break; + case llvm::omp::Category::Utility: + switch (id) { + case llvm::omp::Directive::OMPD_error: + case llvm::omp::Directive::OMPD_nothing: + return true; + default: + return false; + } + break; + } + return false; +} + const OmpArgumentList &OmpDirectiveSpecification::Arguments() const { static OmpArgumentList empty{decltype(OmpArgumentList::v){}}; if (auto &arguments = std::get>(t)) { diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index ec894ab8513d2..3a9a475c365ee 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -179,7 +179,9 @@ void Prescanner::Statement() { EmitChar(tokens, *sp); } if (inFixedForm_) { - while (column_ < 6) { + // We need to add the whitespace after the sentinel because otherwise + // the line cannot be re-categorised as a compiler directive. + while (column_ <= 6) { if (*at_ == '\t') { tabInCurrentLine_ = true; ++at_; diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp index d6e7f11297187..5f4e62ffdbbf2 100644 --- a/flang/lib/Parser/program-parsers.cpp +++ b/flang/lib/Parser/program-parsers.cpp @@ -108,7 +108,7 @@ constexpr auto actionStmtLookAhead{first(actionStmt >> ok, "ALLOCATE ("_tok, "CALL" >> name >> "("_tok, "GO TO"_tok, "OPEN ("_tok, "PRINT"_tok / space / !"("_tok, "READ ("_tok, "WRITE ("_tok)}; constexpr auto execPartLookAhead{first(actionStmtLookAhead >> ok, - openaccConstruct >> ok, openmpConstruct >> ok, "ASSOCIATE ("_tok, + openaccConstruct >> ok, openmpExecDirective >> ok, "ASSOCIATE ("_tok, "BLOCK"_tok, "SELECT"_tok, "CHANGE TEAM"_sptok, "CRITICAL"_tok, "DO"_tok, "IF ("_tok, "WHERE ("_tok, "FORALL ("_tok, "!$CUF"_tok)}; constexpr auto declErrorRecovery{ diff --git a/flang/lib/Parser/type-parsers.h b/flang/lib/Parser/type-parsers.h index 8e91ede093fc4..3900c5a86c874 100644 --- a/flang/lib/Parser/type-parsers.h +++ b/flang/lib/Parser/type-parsers.h @@ -137,6 +137,7 @@ constexpr Parser compilerDirective; constexpr Parser openaccConstruct; constexpr Parser openaccDeclarativeConstruct; constexpr Parser openmpConstruct; +constexpr Parser openmpExecDirective; constexpr Parser openmpDeclarativeConstruct; constexpr Parser ompEndLoopDirective; constexpr Parser intrinsicVectorTypeSpec; // Extension diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index f9d64485f1407..a2f2906af10b8 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -151,8 +151,8 @@ class CheckHelper { void CheckProcedureAssemblyName(const Symbol &symbol); void CheckExplicitSave(const Symbol &); parser::Messages WhyNotInteroperableDerivedType(const Symbol &); - parser::Messages WhyNotInteroperableObject( - const Symbol &, bool allowNonInteroperableType = false); + 
parser::Messages WhyNotInteroperableObject(const Symbol &,
+      bool allowNonInteroperableType = false, bool forCommonBlock = false);
   parser::Messages WhyNotInteroperableFunctionResult(const Symbol &);
   parser::Messages WhyNotInteroperableProcedure(const Symbol &, bool isError);
   void CheckBindC(const Symbol &);
@@ -519,11 +519,35 @@ void CheckHelper::Check(const Symbol &symbol) {
 }
 
 void CheckHelper::CheckCommonBlock(const Symbol &symbol) {
+  auto restorer{messages_.SetLocation(symbol.name())};
   CheckGlobalName(symbol);
   if (symbol.attrs().test(Attr::BIND_C)) {
     CheckBindC(symbol);
+    for (auto ref : symbol.get<CommonBlockDetails>().objects()) {
+      if (ref->has<ObjectEntityDetails>()) {
+        if (auto msgs{WhyNotInteroperableObject(*ref,
+                /*allowNonInteroperableType=*/false, /*forCommonBlock=*/true)};
+            !msgs.empty()) {
+          parser::Message &reason{msgs.messages().front()};
+          parser::Message *msg{nullptr};
+          if (reason.IsFatal()) {
+            msg = messages_.Say(symbol.name(),
+                "'%s' may not be a member of BIND(C) COMMON block /%s/"_err_en_US,
+                ref->name(), symbol.name());
+          } else {
+            msg = messages_.Say(symbol.name(),
+                "'%s' should not be a member of BIND(C) COMMON block /%s/"_warn_en_US,
+                ref->name(), symbol.name());
+          }
+          if (msg) {
+            msg->Attach(
+                std::move(reason.set_severity(parser::Severity::Because)));
+          }
+        }
+      }
+    }
   }
-  for (MutableSymbolRef ref : symbol.get<CommonBlockDetails>().objects()) {
+  for (auto ref : symbol.get<CommonBlockDetails>().objects()) {
     if (ref->test(Symbol::Flag::CrayPointee)) {
       messages_.Say(ref->name(),
           "Cray pointee '%s' may not be a member of a COMMON block"_err_en_US,
@@ -3154,14 +3178,16 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
 }
 
 parser::Messages CheckHelper::WhyNotInteroperableObject(
-    const Symbol &symbol, bool allowNonInteroperableType) {
+    const Symbol &symbol, bool allowNonInteroperableType, bool forCommonBlock) {
   parser::Messages msgs;
-  if (examinedByWhyNotInteroperable_.find(symbol) !=
-      examinedByWhyNotInteroperable_.end()) {
-    return msgs;
+  if (!forCommonBlock) {
+    if (examinedByWhyNotInteroperable_.find(symbol) !=
+        examinedByWhyNotInteroperable_.end()) {
+      return msgs;
+    }
+    examinedByWhyNotInteroperable_.insert(symbol);
   }
   bool isExplicitBindC{symbol.attrs().test(Attr::BIND_C)};
-  examinedByWhyNotInteroperable_.insert(symbol);
   CHECK(symbol.has<ObjectEntityDetails>());
   if (isExplicitBindC && !symbol.owner().IsModule()) {
     msgs.Say(symbol.name(),
@@ -3258,7 +3284,7 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(
     msgs.Say(symbol.name(),
         "An interoperable pointer must not be CONTIGUOUS"_err_en_US);
   }
-  if (msgs.AnyFatalError()) {
+  if (!forCommonBlock && msgs.AnyFatalError()) {
     examinedByWhyNotInteroperable_.erase(symbol);
   }
   return msgs;
@@ -3338,8 +3364,8 @@ parser::Messages CheckHelper::WhyNotInteroperableProcedure(
           // on the C side by either a cdesc_t * or a void *.
F'2023 18.3.7 (5) bool allowNonInteroperableType{!dummy->attrs().test(Attr::VALUE) && (IsDescriptor(*dummy) || IsAssumedType(*dummy))}; - dummyMsgs = - WhyNotInteroperableObject(*dummy, allowNonInteroperableType); + dummyMsgs = WhyNotInteroperableObject( + *dummy, allowNonInteroperableType, /*forCommonBlock=*/false); } else { CheckBindC(*dummy); } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 2425265e196c6..e4a94efcc6b55 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1156,8 +1156,7 @@ void OmpStructureChecker::CheckThreadprivateOrDeclareTargetVar( (sym->has() || sym->has())) { context_.Say(name->source, - "The module name or main program name cannot be in a " - "%s " + "The module name cannot be in a %s " "directive"_err_en_US, ContextDirectiveAsFortran()); } else if (!IsSaved(*name->symbol) && diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 53ec3827893d0..14473724f0f40 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -178,7 +178,7 @@ class ArgumentAnalyzer { } // Find and return a user-defined assignment std::optional TryDefinedAssignment(); - std::optional GetDefinedAssignmentProc(); + std::optional GetDefinedAssignmentProc(bool &isAmbiguous); std::optional GetType(std::size_t) const; void Dump(llvm::raw_ostream &); @@ -191,7 +191,7 @@ class ArgumentAnalyzer { MaybeExpr AnalyzeExprOrWholeAssumedSizeArray(const parser::Expr &); bool AreConformable() const; const Symbol *FindBoundOp(parser::CharBlock, int passIndex, - const Symbol *&generic, bool isSubroutine); + const Symbol *&generic, bool isSubroutine, bool *isAmbiguous = nullptr); void AddAssignmentConversion( const DynamicType &lhsType, const DynamicType &rhsType); bool OkLogicalIntegerAssignment(TypeCategory lhs, TypeCategory rhs); @@ -199,7 +199,8 @@ class ArgumentAnalyzer { bool IsBOZLiteral(std::size_t i) const { return evaluate::IsBOZLiteral(GetExpr(i)); } - void SayNoMatch(const std::string &, bool isAssignment = false); + void SayNoMatch( + const std::string &, bool isAssignment = false, bool isAmbiguous = false); std::string TypeAsFortran(std::size_t); bool AnyUntypedOrMissingOperand(); @@ -4781,7 +4782,9 @@ std::optional ArgumentAnalyzer::TryDefinedAssignment() { return std::nullopt; // user-defined assignment not allowed for these args } auto restorer{context_.GetContextualMessages().SetLocation(source_)}; - if (std::optional procRef{GetDefinedAssignmentProc()}) { + bool isAmbiguous{false}; + if (std::optional procRef{ + GetDefinedAssignmentProc(isAmbiguous)}) { if (context_.inWhereBody() && !procRef->proc().IsElemental()) { // C1032 context_.Say( "Defined assignment in WHERE must be elemental, but '%s' is not"_err_en_US, @@ -4791,9 +4794,11 @@ std::optional ArgumentAnalyzer::TryDefinedAssignment() { return std::move(*procRef); } if (isDefined == Tristate::Yes) { - if (!lhsType || !rhsType || (lhsRank != rhsRank && rhsRank != 0) || + if (isAmbiguous || !lhsType || !rhsType || + (lhsRank != rhsRank && rhsRank != 0) || !OkLogicalIntegerAssignment(lhsType->category(), rhsType->category())) { - SayNoMatch("ASSIGNMENT(=)", true); + SayNoMatch( + "ASSIGNMENT(=)", /*isAssignment=*/true, /*isAmbiguous=*/isAmbiguous); } } else if (!fatalErrors_) { CheckAssignmentConformance(); @@ -4822,13 +4827,15 @@ bool ArgumentAnalyzer::OkLogicalIntegerAssignment( return true; } -std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() 
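// Illustrative sketch, not part of this patch: the isAmbiguous plumbing
// added above threads an out-parameter through resolution so the caller can
// distinguish "no match" from "more than one match" when wording the
// diagnostic:
#include <optional>
#include <string>
#include <vector>

static std::optional<std::string>
resolveUnique(const std::vector<std::string> &candidates, bool &isAmbiguous) {
  isAmbiguous = candidates.size() > 1;
  if (candidates.size() == 1)
    return candidates.front();
  return std::nullopt; // none found, or ambiguous; isAmbiguous says which
}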
{ +std::optional<ProcedureRef> ArgumentAnalyzer::GetDefinedAssignmentProc( + bool &isAmbiguous) { const Symbol *proc{nullptr}; bool isProcElemental{false}; std::optional<int> passedObjectIndex; std::string oprNameString{"assignment(=)"}; parser::CharBlock oprName{oprNameString}; const auto &scope{context_.context().FindScope(source_)}; + isAmbiguous = false; { auto restorer{context_.GetContextualMessages().DiscardMessages()}; if (const Symbol *symbol{scope.FindSymbol(oprName)}) { @@ -4842,8 +4849,8 @@ std::optional<ProcedureRef> ArgumentAnalyzer::GetDefinedAssignmentProc() { for (std::size_t i{0}; (!proc || isProcElemental) && i < actuals_.size(); ++i) { const Symbol *generic{nullptr}; - if (const Symbol * - binding{FindBoundOp(oprName, i, generic, /*isSubroutine=*/true)}) { + if (const Symbol *binding{FindBoundOp(oprName, i, generic, + /*isSubroutine=*/true, /*isAmbiguous=*/&isAmbiguous)}) { // ignore inaccessible type-bound ASSIGNMENT(=) generic if (!CheckAccessibleSymbol(scope, DEREF(generic))) { const Symbol *resolution{GetBindingResolution(GetType(i), *binding)}; @@ -4967,7 +4974,8 @@ bool ArgumentAnalyzer::AreConformable() const { // Look for a type-bound operator in the type of arg number passIndex. const Symbol *ArgumentAnalyzer::FindBoundOp(parser::CharBlock oprName, - int passIndex, const Symbol *&generic, bool isSubroutine) { + int passIndex, const Symbol *&generic, bool isSubroutine, + bool *isAmbiguous) { const auto *type{GetDerivedTypeSpec(GetType(passIndex))}; const semantics::Scope *scope{type ? type->scope() : nullptr}; if (scope) { @@ -4989,6 +4997,9 @@ const Symbol *ArgumentAnalyzer::FindBoundOp(parser::CharBlock oprName, // Use the most recent override of the binding, if any return scope->FindComponent(binding->name()); } else { + if (isAmbiguous) { + *isAmbiguous = pair.second; + } context_.EmitGenericResolutionError(*generic, pair.second, isSubroutine); } } @@ -5072,40 +5083,37 @@ void ArgumentAnalyzer::ConvertBOZAssignmentRHS(const DynamicType &lhsType) { } // Report error resolving opr when there is a user-defined one available -void ArgumentAnalyzer::SayNoMatch(const std::string &opr, bool isAssignment) { +void ArgumentAnalyzer::SayNoMatch( + const std::string &opr, bool isAssignment, bool isAmbiguous) { std::string type0{TypeAsFortran(0)}; auto rank0{actuals_[0]->Rank()}; + std::string prefix{"No intrinsic or user-defined "s + opr + " matches"}; + if (isAmbiguous) { + prefix = "Multiple specific procedures for the generic "s + opr + " match"; + } if (actuals_.size() == 1) { if (rank0 > 0) { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s"_err_en_US, - opr, rank0, type0); + context_.Say("%s rank %d array of %s"_err_en_US, prefix, rank0, type0); } else { - context_.Say("No intrinsic or user-defined %s matches " - "operand type %s"_err_en_US, - opr, type0); + context_.Say("%s operand type %s"_err_en_US, prefix, type0); } } else { std::string type1{TypeAsFortran(1)}; auto rank1{actuals_[1]->Rank()}; if (rank0 > 0 && rank1 > 0 && rank0 != rank1) { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s and rank %d array of %s"_err_en_US, - opr, rank0, type0, rank1, type1); + context_.Say("%s rank %d array of %s and rank %d array of %s"_err_en_US, + prefix, rank0, type0, rank1, type1); } else if (isAssignment && rank0 != rank1) { if (rank0 == 0) { - context_.Say("No intrinsic or user-defined %s matches " - "scalar %s and rank %d array of %s"_err_en_US, - opr, type0, rank1, type1); + context_.Say("%s scalar %s and rank %d array of %s"_err_en_US,
prefix, + type0, rank1, type1); } else { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s and scalar %s"_err_en_US, - opr, rank0, type0, type1); + context_.Say("%s rank %d array of %s and scalar %s"_err_en_US, prefix, + rank0, type0, type1); } } else { - context_.Say("No intrinsic or user-defined %s matches " - "operand types %s and %s"_err_en_US, - opr, type0, type1); + context_.Say( + "%s operand types %s and %s"_err_en_US, prefix, type0, type1); } } } diff --git a/flang/lib/Semantics/resolve-labels.cpp b/flang/lib/Semantics/resolve-labels.cpp index b0cbc4b56e889..27e259fab3873 100644 --- a/flang/lib/Semantics/resolve-labels.cpp +++ b/flang/lib/Semantics/resolve-labels.cpp @@ -489,15 +489,30 @@ class ParseTreeAnalyzer { // C1401 void Post(const parser::MainProgram &mainProgram) { + // Uppercase the name of the main program, so that its symbol name + // is distinct from similarly named non-main-program symbols. + auto upperCaseCharBlock = [](const parser::CharBlock &cb) { + char *ch{const_cast<char *>(cb.begin())}; + char *endCh{ch + cb.size()}; + while (ch != endCh) { + *ch++ = parser::ToUpperCaseLetter(*ch); + } + }; + const parser::CharBlock *progName{nullptr}; + if (const auto &program{ + std::get<std::optional<parser::Statement<parser::ProgramStmt>>>( + mainProgram.t)}) { + progName = &program->statement.v.source; + upperCaseCharBlock(*progName); + } if (const parser::CharBlock * endName{GetStmtName(std::get<parser::Statement<parser::EndProgramStmt>>( mainProgram.t))}) { - if (const auto &program{ - std::get<std::optional<parser::Statement<parser::ProgramStmt>>>( - mainProgram.t)}) { - if (*endName != program->statement.v.source) { + upperCaseCharBlock(*endName); + if (progName) { + if (*endName != *progName) { context_.Say(*endName, "END PROGRAM name mismatch"_err_en_US) - .Attach(program->statement.v.source, "should be"_en_US); + .Attach(*progName, "should be"_en_US); } } else { context_.Say(*endName, diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 96faa5fd954cd..b3268605e7c0c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -8574,8 +8574,10 @@ bool ResolveNamesVisitor::Pre(const parser::ImportStmt &x) { } else { Say(name, "A distinct '%s' is already present in this scope"_err_en_US) - .Attach(symbol->name(), "Previous declaration of '%s'"_en_US) - .Attach(outer->name(), "Declaration of '%s' in host scope"_en_US); + .Attach(symbol->name(), "Previous declaration of '%s'"_en_US, + symbol->name().ToString()) + .Attach(outer->name(), "Declaration of '%s' in host scope"_en_US, + outer->name().ToString()); } } } else { diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 51ba21a9e5edf..5916a07df7744 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -1131,7 +1131,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( if (auto proc{evaluate::characteristics::Procedure::Characterize( specific, context_.foldingContext())}) { std::uint8_t isArgDescriptorSet{0}; - std::uint8_t isArgContiguousSet{0}; + bool specialCaseFlag{false}; int argThatMightBeDescriptor{0}; MaybeExpr which; if (isAssignment) { @@ -1197,7 +1197,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( TypeAndShape::Attr::AssumedShape) || dummyData.attrs.test(evaluate::characteristics:: DummyDataObject::Attr::Contiguous)) { - isArgContiguousSet |= 1; + specialCaseFlag = true; } } } @@ -1216,7 +1216,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( return; } if (ddo->type.type().IsPolymorphic()) { - isArgDescriptorSet |= 1; + argThatMightBeDescriptor
= 1; } switch (io.value()) { case common::DefinedIo::ReadFormatted: @@ -1232,6 +1232,9 @@ void RuntimeTableBuilder::DescribeSpecialProc( which = writeUnformattedEnum_; break; } + if (context_.defaultKinds().GetDefaultKind(TypeCategory::Integer) == 8) { + specialCaseFlag = true; // UNIT= & IOSTAT= INTEGER(8) + } } if (argThatMightBeDescriptor != 0) { if (const auto *dummyData{ @@ -1262,8 +1265,8 @@ void RuntimeTableBuilder::DescribeSpecialProc( } CHECK(bindingIndex <= 255); AddValue(values, specialSchema_, "istypebound"s, IntExpr<1>(bindingIndex)); - AddValue(values, specialSchema_, "isargcontiguousset"s, - IntExpr<1>(isArgContiguousSet)); + AddValue(values, specialSchema_, "specialcaseflag"s, + IntExpr<1>(specialCaseFlag)); AddValue(values, specialSchema_, procCompName, SomeExpr{evaluate::ProcedureDesignator{specific}}); // index might already be present in the case of an override @@ -1383,19 +1386,26 @@ CollectNonTbpDefinedIoGenericInterfaces( } else { // Local scope's specific overrides host's for this type bool updated{false}; + std::uint8_t flags{0}; + if (declType->IsPolymorphic()) { + flags |= IsDtvArgPolymorphic; + } + if (scope.context().GetDefaultKind(TypeCategory::Integer) == + 8) { + flags |= DefinedIoInteger8; + } for (auto [iter, end]{result.equal_range(dtDesc)}; iter != end; ++iter) { NonTbpDefinedIo &nonTbp{iter->second}; if (nonTbp.definedIo == which) { nonTbp.subroutine = &*specific; - nonTbp.isDtvArgPolymorphic = declType->IsPolymorphic(); + nonTbp.flags = flags; updated = true; } } if (!updated) { - result.emplace(dtDesc, - NonTbpDefinedIo{ - &*specific, which, declType->IsPolymorphic()}); + result.emplace( + dtDesc, NonTbpDefinedIo{&*specific, which, flags}); } } } diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index ab78605d01f4c..b15ed057b52f2 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -376,8 +376,7 @@ const DeclTypeSpec &SemanticsContext::MakeLogicalType(int kind) { } bool SemanticsContext::AnyFatalError() const { - return !messages_.empty() && - (warningsAreErrors_ || messages_.AnyFatalError()); + return messages_.AnyFatalError(warningsAreErrors_); } bool SemanticsContext::HasError(const Symbol &symbol) { return errorSymbols_.count(symbol) > 0; @@ -658,7 +657,7 @@ void Semantics::EmitMessages(llvm::raw_ostream &os) { context_.messages().ResolveProvenances(context_.allCookedSources()); context_.messages().Emit(os, context_.allCookedSources(), /*echoSourceLine=*/true, &context_.languageFeatures(), - /*maxErrorsToEmit=*/context_.maxErrors()); + context_.maxErrors(), context_.warningsAreErrors()); } void SemanticsContext::DumpSymbols(llvm::raw_ostream &os) { diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 5e5b43f26c791..5a5b02e1ac3ce 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -348,9 +348,9 @@ const Symbol &BypassGeneric(const Symbol &symbol) { const Symbol &GetCrayPointer(const Symbol &crayPointee) { const Symbol *found{nullptr}; - for (const auto &[pointee, pointer] : - crayPointee.GetUltimate().owner().crayPointers()) { - if (pointee == crayPointee.name()) { + const Symbol &ultimate{crayPointee.GetUltimate()}; + for (const auto &[pointee, pointer] : ultimate.owner().crayPointers()) { + if (pointee == ultimate.name()) { found = &pointer.get(); break; } diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index 8dd27d6e4c01b..6af2a5a5e30ff 100644 --- 
a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -118,7 +118,7 @@ integer(1) :: which ! SpecialBinding::Which integer(1) :: isArgDescriptorSet integer(1) :: isTypeBound ! binding index + 1, if any - integer(1) :: isArgContiguousSet + integer(1) :: specialCaseFlag integer(1) :: __padding0(4) type(__builtin_c_funptr) :: proc end type diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index f8a30da8b9615..d0c312c09353f 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -957,11 +957,21 @@ attributes(device) pure integer function atomicxori(address, val) ! Time function + interface + attributes(device) integer function clock() + end function + end interface + interface attributes(device) integer(8) function clock64() end function end interface + interface + attributes(device) integer(8) function globalTimer() + end function + end interface + ! Warp Match Functions interface match_all_sync diff --git a/flang/test/Driver/cuda-option.f90 b/flang/test/Driver/cuda-option.f90 index 0740ed509a077..f55e88dab20ce 100644 --- a/flang/test/Driver/cuda-option.f90 +++ b/flang/test/Driver/cuda-option.f90 @@ -8,7 +8,7 @@ program main integer, device :: dvar end program -! CHECK-LABEL: PROGRAM main +! CHECK-LABEL: PROGRAM MAIN ! CHECK: INTEGER :: var = 1 ! CHECK: INTEGER, DEVICE :: dvar diff --git a/flang/test/Driver/fatal-errors-warnings.f90 b/flang/test/Driver/fatal-errors-warnings.f90 new file mode 100644 index 0000000000000..2de09c3ed0778 --- /dev/null +++ b/flang/test/Driver/fatal-errors-warnings.f90 @@ -0,0 +1,31 @@ +! RUN: %flang_fc1 -Wfatal-errors -pedantic %s 2>&1 | FileCheck %s --check-prefix=CHECK1 +! RUN: not %flang_fc1 -pedantic -Werror %s 2>&1 | FileCheck %s --check-prefix=CHECK2 +! RUN: not %flang_fc1 -Wfatal-errors -pedantic -Werror %s 2>&1 | FileCheck %s --check-prefix=CHECK3 + +module m + contains + subroutine foo(a) + real, intent(in), target :: a(:) + end subroutine +end module + +program test + use m + real, target :: a(1) + real :: b(1) + call foo(a) ! ok + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK3: fatal-errors-warnings.f90:{{.*}} warning: + call foo(b) + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK3-NOT: error: + !CHECK3-NOT: warning: + call foo((a)) + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + call foo(a([1])) + !! Hard error instead of warning if uncommented. + !call foo(a(1)) +end \ No newline at end of file diff --git a/flang/test/Driver/unparse-use-analyzed.f95 b/flang/test/Driver/unparse-use-analyzed.f95 index eb6046aebba54..4bcd72c9a5f50 100644 --- a/flang/test/Driver/unparse-use-analyzed.f95 +++ b/flang/test/Driver/unparse-use-analyzed.f95 @@ -6,12 +6,12 @@ ! RUN: %flang_fc1 -fdebug-unparse %s | FileCheck %s --check-prefix=DEFAULT ! RUN: %flang_fc1 -fdebug-unparse -fno-analyzed-objects-for-unparse %s | FileCheck %s --check-prefix=DISABLED -! DEFAULT: PROGRAM test +! DEFAULT: PROGRAM TEST ! DEFAULT-NEXT: REAL, PARAMETER :: val = 3.43e2_4 ! DEFAULT-NEXT: PRINT *, 3.47e2_4 ! DEFAULT-NEXT: END PROGRAM -! DISABLED: PROGRAM test +! DISABLED: PROGRAM TEST ! DISABLED-NEXT: REAL, PARAMETER :: val = 343.0 ! DISABLED-NEXT: PRINT *, val+4 ! 
DISABLED-NEXT: END PROGRAM diff --git a/flang/test/Driver/unparse-with-modules.f90 b/flang/test/Driver/unparse-with-modules.f90 index 53997f7804efa..f6444afbe47c1 100644 --- a/flang/test/Driver/unparse-with-modules.f90 +++ b/flang/test/Driver/unparse-with-modules.f90 @@ -25,7 +25,7 @@ program test !CHECK: implicit none !CHECK: real(kind=real32) x !CHECK: end module -!CHECK: program test +!CHECK: program TEST !CHECK: use :: m1 !CHECK: use :: basictestmoduletwo !CHECK: implicit none diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir index 31f2ed022b6c4..8b6e7d67931df 100644 --- a/flang/test/Fir/CUDA/cuda-alloc-free.fir +++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir @@ -94,4 +94,19 @@ func.func @_QQalloc_char() attributes {fir.bindc_name = "alloc_char"} { // CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64 // CHECK: fir.call @_FortranACUFMemAlloc(%[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda} : (i64, i32, !fir.ref, i32) -> !fir.llvm_ptr + +func.func @_QQsetalloc() { + %0 = cuf.alloc !fir.type<_QMm1Tdt1{a2:!fir.box>>}> {bindc_name = "d1", data_attr = #cuf.cuda, uniq_name = "_QFEd1"} -> !fir.ref>>}>> + %1 = fir.coordinate_of %0, a2 : (!fir.ref>>}>>) -> !fir.ref>>> + cuf.set_allocator_idx %1 : !fir.ref>>> {data_attr = #cuf.cuda} + return +} + +// CHECK-LABEL: func.func @_QQsetalloc() { +// CHECK: %[[DT:.*]] = fir.call @_FortranACUFMemAlloc +// CHECK: %[[CONV:.*]] = fir.convert %[[DT]] : (!fir.llvm_ptr) -> !fir.ref>>}>> +// CHECK: %[[COMP:.*]] = fir.coordinate_of %[[CONV]], a2 : (!fir.ref>>}>>) -> !fir.ref>>> +// CHECK: %[[DESC:.*]] = fir.convert %[[COMP]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: fir.call @_FortranACUFSetAllocatorIndex(%[[DESC]], %c2{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, i32, !fir.ref, i32) -> () + } // end module diff --git a/flang/test/Fir/CUDA/cuda-device-global.f90 b/flang/test/Fir/CUDA/cuda-device-global.f90 index 4c634513745fd..35c025dad3000 100644 --- a/flang/test/Fir/CUDA/cuda-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-device-global.f90 @@ -24,3 +24,26 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.conta // CHECK: gpu.module @cuda_device_mod // CHECK-DAG: fir.global @_QMm2ECc // CHECK-DAG: fir.global @_QMm1ECb + +// ----- + +module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} { + fir.global @_QMmEddarrays {data_attr = #cuf.cuda} : !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.heap>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>> + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) {allocator_idx = 3 : i32} : (!fir.heap>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>, !fir.shape<1>) -> !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> + fir.has_value %2 : !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> + } + fir.global linkonce_odr @_QMmE.dt.devicearrays constant target : 
!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> { %0 = fir.undefined !fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> fir.has_value %0 : !fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> } } // CHECK-DAG: fir.global @_QMmEddarrays // CHECK-DAG: fir.global linkonce_odr @_QMmE.dt.devicearrays // CHECK: gpu.module @cuda_device_mod // CHECK-DAG: fir.global @_QMmEddarrays // CHECK-DAG: fir.global linkonce_odr @_QMmE.dt.devicearrays diff --git a/flang/test/Fir/alloc-32.fir b/flang/test/Fir/alloc-32.fir index 3eefc3225fac7..a3cbf200c24fc 100644 --- a/flang/test/Fir/alloc-32.fir +++ b/flang/test/Fir/alloc-32.fir @@ -20,7 +20,9 @@ func.func @allocmem_scalar_nonchar() -> !fir.heap<i32> { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] -// CHECK: %[[trunc:.*]] = trunc i64 %[[mul2]] to i32 +// CHECK: %[[cmp:.*]] = icmp sgt i64
%[[mul2]], 0 +// CHECK: %[[sz:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: %[[trunc:.*]] = trunc i64 %[[sz]] to i32 // CHECK: call ptr @malloc(i32 %[[trunc]]) func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { %1 = fir.allocmem !fir.char<1,?>(%l : i32) diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 5b4930bb9cb34..8da8b828c18b9 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -87,7 +87,9 @@ func.func @alloca_scalar_dynchar_kind(%l : i32) -> !fir.ref> { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { %1 = fir.allocmem !fir.char<1,?>(%l : i32) return %1 : !fir.heap> @@ -97,7 +99,9 @@ func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 2, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_scalar_dynchar_kind(%l : i32) -> !fir.heap>{ %1 = fir.allocmem !fir.char<2,?>(%l : i32) return %1 : !fir.heap> @@ -152,7 +156,9 @@ func.func @allocmem_array_of_char() -> !fir.heap> // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 9, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_array_of_dynchar(%l: i32) -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x3x!fir.char<1,?>>(%l : i32) return %1 : !fir.heap>> @@ -180,7 +186,9 @@ func.func @alloca_dynarray_of_nonchar2(%e: index) -> !fir.ref !fir.heap> { %1 = fir.allocmem !fir.array<3x?xi32>, %e return %1 : !fir.heap> @@ -190,7 +198,9 @@ func.func @allocmem_dynarray_of_nonchar(%e: index) -> !fir.heap !fir.heap> { %1 = fir.allocmem !fir.array, %e, %e return %1 : !fir.heap> @@ -218,7 +228,9 @@ func.func @alloca_dynarray_of_char2(%e : index) -> !fir.ref !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x!fir.char<2,10>>, %e return %1 : !fir.heap>> @@ -228,7 +240,9 @@ func.func @allocmem_dynarray_of_char(%e : index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array>, %e, %e return %1 : !fir.heap>> @@ -261,7 +275,9 @@ func.func @alloca_dynarray_of_dynchar2(%l: i32, %e : index) -> !fir.ref !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x!fir.char<2,?>>(%l : i32), %e return %1 : !fir.heap>> @@ -273,7 +289,9 @@ func.func @allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array>(%l : i32), %e, %e return %1 : !fir.heap>> @@ -312,7 +330,9 @@ func.func @alloca_array_with_holes_dynchar(%arg0: index, %arg1: index) -> !fir.r // CHECK-SAME: i64 %[[e1:.*]], i64 %[[e2:.*]]) // CHECK: %[[a:.*]] = mul i64 240, %[[e1]] // CHECK: %[[b:.*]] = mul i64 %3, %[[e2]] -// CHECK: call ptr @malloc(i64 %[[b]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[b]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[b]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func 
@allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> !fir.heap> { %a = fir.allocmem !fir.array<4x?x3x?x5xi32>, %0, %1 return %a : !fir.heap> @@ -321,7 +341,9 @@ func.func @allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> !fir.hea // CHECK-LABEL: define ptr @allocmem_array_with_holes_char( // CHECK-SAME: i64 %[[e:.*]]) // CHECK: %[[mul:.*]] = mul i64 240, %[[e]] -// CHECK: call ptr @malloc(i64 %[[mul]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_array_with_holes_char(%e: index) -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x4x!fir.char<2,10>>, %e return %1 : !fir.heap>> @@ -331,7 +353,9 @@ func.func @allocmem_array_with_holes_char(%e: index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x4x!fir.char<2,?>>(%arg0 : index), %arg1 return %1 : !fir.heap>> diff --git a/flang/test/Fir/arrexp.fir b/flang/test/Fir/arrexp.fir index 6c7f71f6f1f9c..e8ec8ac79e0c2 100644 --- a/flang/test/Fir/arrexp.fir +++ b/flang/test/Fir/arrexp.fir @@ -146,7 +146,9 @@ func.func @f6(%arg0: !fir.box>, %arg1: f32) { // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i64 0, i32 1 // CHECK: %[[EXTENT:.*]] = load i64, ptr %[[EXT_GEP]] // CHECK: %[[SIZE:.*]] = mul i64 4, %[[EXTENT]] - // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SIZE]]) + // CHECK: %[[CMP:.*]] = icmp sgt i64 %[[SIZE]], 0 + // CHECK: %[[SZ:.*]] = select i1 %[[CMP]], i64 %[[SIZE]], i64 1 + // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SZ]]) %1 = fir.slice %c2, %c10, %c1 : (index, index, index) -> !fir.slice<1> %2 = fir.array_load %arg0 [%1] : (!fir.box>, !fir.slice<1>) -> !fir.array %3 = fir.slice %c1, %c9, %c1 : (index, index, index) -> !fir.slice<1> diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 0e2bfe48a807d..50a98466f0d4b 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -216,10 +216,14 @@ func.func @test_alloc_and_freemem_one() { } // CHECK-LABEL: llvm.func @test_alloc_and_freemem_one() { -// CHECK: %[[N:.*]] = llvm.mlir.constant(4 : i64) : i64 -// CHECK-NEXT: llvm.call @malloc(%[[N]]) -// CHECK: llvm.call @free(%{{.*}}) -// CHECK-NEXT: llvm.return +// CHECK-DAG: %[[N:.*]] = llvm.mlir.constant(4 : i64) : i64 +// CHECK-DAG: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK-DAG: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK-NEXT: %[[CMP:.*]] = llvm.icmp "sgt" %[[N]], %[[ZERO]] : i64 +// CHECK-NEXT: %[[SZ:.*]] = llvm.select %[[CMP]], %[[N]], %[[ONE]] : i1, i64 +// CHECK-NEXT: llvm.call @malloc(%[[SZ]]) +// CHECK: llvm.call @free(%{{.*}}) +// CHECK-NEXT: llvm.return // ----- // Verify that fir.allocmem is transformed to a call to malloc @@ -233,8 +237,12 @@ func.func @test_alloc_and_freemem_several() { } // CHECK-LABEL: llvm.func @test_alloc_and_freemem_several() { -// CHECK: %[[N:.*]] = llvm.mlir.constant(400 : i64) : i64 -// CHECK: [[MALLOC:%.*]] = llvm.call @malloc(%[[N]]) +// CHECK-DAG: %[[N:.*]] = llvm.mlir.constant(400 : i64) : i64 +// CHECK-DAG: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK-DAG: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK-NEXT: %[[CMP:.*]] = llvm.icmp "sgt" %[[N]], %[[ZERO]] : i64 +// CHECK-NEXT: %[[SZ:.*]] = llvm.select %[[CMP]], %[[N]], %[[ONE]] : i1, i64 +// CHECK: [[MALLOC:%.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free([[MALLOC]]) // CHECK: llvm.return @@ -250,7 +258,11 @@ func.func 
@test_with_shape(%ncols: index, %nrows: index) { // CHECK: %[[FOUR:.*]] = llvm.mlir.constant(4 : i64) : i64 // CHECK: %[[DIM1_SIZE:.*]] = llvm.mul %[[FOUR]], %[[NCOLS]] : i64 // CHECK: %[[TOTAL_SIZE:.*]] = llvm.mul %[[DIM1_SIZE]], %[[NROWS]] : i64 -// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[TOTAL_SIZE]]) +// CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[CMP:.*]] = llvm.icmp "sgt" %[[TOTAL_SIZE]], %[[ZERO]] : i64 +// CHECK: %[[SZ:.*]] = llvm.select %[[CMP]], %[[TOTAL_SIZE]], %[[ONE]] : i1, i64 +// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free(%[[MEM]]) : (!llvm.ptr) -> () // CHECK: llvm.return // CHECK: } @@ -266,7 +278,11 @@ func.func @test_string_with_shape(%len: index, %nelems: index) { // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[LEN_SIZE:.*]] = llvm.mul %[[ONE]], %[[LEN]] : i64 // CHECK: %[[TOTAL_SIZE:.*]] = llvm.mul %[[LEN_SIZE]], %[[NELEMS]] : i64 -// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[TOTAL_SIZE]]) +// CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[ONEA:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[CMP:.*]] = llvm.icmp "sgt" %[[TOTAL_SIZE]], %[[ZERO]] : i64 +// CHECK: %[[SZ:.*]] = llvm.select %[[CMP]], %[[TOTAL_SIZE]], %[[ONEA]] : i1, i64 +// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free(%[[MEM]]) : (!llvm.ptr) -> () // CHECK: llvm.return // CHECK: } diff --git a/flang/test/Integration/debug-common-block-1.f90 b/flang/test/Integration/debug-common-block-1.f90 index 18217637be0fa..77f47daea4a91 100644 --- a/flang/test/Integration/debug-common-block-1.f90 +++ b/flang/test/Integration/debug-common-block-1.f90 @@ -89,7 +89,7 @@ program test ! CHECK-DAG: ![[CBF3]] = !DICommonBlock(scope: ![[F3]], declaration: null, name: "__BLNK__"{{.*}}) ! CHECK-DAG: ![[CBAF3]] = !DICommonBlock(scope: ![[F3]], declaration: null, name: "a"{{.*}}) -! CHECK-DAG: ![[MAIN:[0-9]+]] = {{.*}}!DISubprogram(name: "test"{{.*}}) +! CHECK-DAG: ![[MAIN:[0-9]+]] = {{.*}}!DISubprogram(name: "TEST"{{.*}}) ! CHECK-DAG: ![[CBM]] = !DICommonBlock(scope: ![[MAIN]], declaration: null, name: "__BLNK__"{{.*}}) ! CHECK-DAG: ![[CBAM]] = !DICommonBlock(scope: ![[MAIN]], declaration: null, name: "a"{{.*}}) diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90 index b97be141cc8d0..0ddac633a5b1e 100644 --- a/flang/test/Integration/debug-local-var-2.f90 +++ b/flang/test/Integration/debug-local-var-2.f90 @@ -37,7 +37,7 @@ ! BOTH-LABEL: } program mn -! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "mn", {{.*}}) +! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "MN", {{.*}}) ! BOTH-DAG: ![[TYI32:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) ! BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed) diff --git a/flang/test/Lower/CUDA/cuda-derived.cuf b/flang/test/Lower/CUDA/cuda-derived.cuf index 96250d88d81c4..d419ee074f7a0 100644 --- a/flang/test/Lower/CUDA/cuda-derived.cuf +++ b/flang/test/Lower/CUDA/cuda-derived.cuf @@ -25,6 +25,6 @@ program main type(t2) :: b end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} ! CHECK: %{{.*}} = cuf.alloc !fir.type<_QMm1Tty_device{x:!fir.box>>}> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} ! 
CHECK: %{{.*}} = cuf.alloc !fir.type<_QMm1Tt2{b:!fir.type<_QMm1Tt1{a:!fir.box>>}>}> {bindc_name = "b", data_attr = #cuf.cuda, uniq_name = "_QFEb"} diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 42ee7657966e2..d5e614a83b354 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -10,6 +10,7 @@ attributes(global) subroutine devsub() integer(4) :: ai integer(8) :: al integer(8) :: time + integer :: smalltime call syncthreads() call syncwarp(1) @@ -45,7 +46,9 @@ attributes(global) subroutine devsub() ai = atomicinc(ai, 1_4) ai = atomicdec(ai, 1_4) + smalltime = clock() time = clock64() + time = globalTimer() end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} @@ -82,7 +85,9 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw uinc_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 -! CHECK: fir.call @llvm.nvvm.read.ptx.sreg.clock64() +! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock : i32 +! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock64 : i64 +! CHECK: %{{.*}} = nvvm.read.ptx.sreg.globaltimer : i64 subroutine host1() integer, device :: a(32) diff --git a/flang/test/Lower/CUDA/cuda-return01.cuf b/flang/test/Lower/CUDA/cuda-return01.cuf index 47e69a903efd3..ed7c640a71082 100644 --- a/flang/test/Lower/CUDA/cuda-return01.cuf +++ b/flang/test/Lower/CUDA/cuda-return01.cuf @@ -28,6 +28,6 @@ program main return end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} ! CHECK: cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} -> !fir.ref>>> ! CHECK-NOT: cuf.free diff --git a/flang/test/Lower/CUDA/cuda-return02.cuf b/flang/test/Lower/CUDA/cuda-return02.cuf index e450d7e796f22..e54818444e49c 100644 --- a/flang/test/Lower/CUDA/cuda-return02.cuf +++ b/flang/test/Lower/CUDA/cuda-return02.cuf @@ -13,7 +13,7 @@ program test return end -! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: %[[DECL:.*]]:2 = hlfir.declare ! CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 ! CHECK-NEXT: ^bb1: diff --git a/flang/test/Lower/CUDA/cuda-set-allocator.cuf b/flang/test/Lower/CUDA/cuda-set-allocator.cuf new file mode 100644 index 0000000000000..ee89ea38a3fc7 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-set-allocator.cuf @@ -0,0 +1,24 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +module m1 + type ty_device + integer, device, allocatable, dimension(:) :: x + integer :: y + integer, device, allocatable, dimension(:) :: z + end type +contains + subroutine sub1() + type(ty_device) :: a + end subroutine + +! CHECK-LABEL: func.func @_QMm1Psub1() +! CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.type<_QMm1Tty_device{x:!fir.box>>,y:i32,z:!fir.box>>}> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QMm1Fsub1Ea"} -> !fir.ref>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DT:.*]]:2 = hlfir.declare %[[ALLOC]] {data_attr = #cuf.cuda, uniq_name = "_QMm1Fsub1Ea"} : (!fir.ref>>,y:i32,z:!fir.box>>}>>) -> (!fir.ref>>,y:i32,z:!fir.box>>}>>, !fir.ref>>,y:i32,z:!fir.box>>}>>) +! CHECK: fir.address_of(@_QQ_QMm1Tty_device.DerivedInit) +! CHECK: fir.copy +! CHECK: %[[X:.*]] = fir.coordinate_of %[[DT]]#0, x : (!fir.ref>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! 
CHECK: cuf.set_allocator_idx %[[X]] : !fir.ref>>> {data_attr = #cuf.cuda} +! CHECK: %[[Z:.*]] = fir.coordinate_of %[[DT]]#0, z : (!fir.ref>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! CHECK: cuf.set_allocator_idx %[[Z]] : !fir.ref>>> {data_attr = #cuf.cuda} + +end module diff --git a/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 b/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 index 07c4f012781d4..cbc56ca1e395b 100644 --- a/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 +++ b/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 @@ -24,7 +24,7 @@ program main call mvbits(from, 2, 2, to, 0) if (any(to /= 5)) STOP 1 end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<3xi32> {bindc_name = "from", uniq_name = "_QFEfrom"} ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 b/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 index 7b64634d10d4b..a097b1522307e 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 @@ -35,7 +35,7 @@ FUNCTION BAR() RESULT(res) END END -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref) -> i32>}> ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref) -> i32>}> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.boxproc<(!fir.ref) -> i32> {bindc_name = "pp2", uniq_name = "_QFEpp2"} diff --git a/flang/test/Lower/Intrinsics/cospi.f90 b/flang/test/Lower/Intrinsics/cospi.f90 new file mode 100644 index 0000000000000..894002566141b --- /dev/null +++ b/flang/test/Lower/Intrinsics/cospi.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefixes="CHECK" + +function test_real4(x) + real :: x, test_real4 + test_real4 = cospi(x) +end function + +! CHECK-LABEL: @_QPtest_real4 +! CHECK: %[[dfactor:.*]] = arith.constant 3.1415926535897931 : f64 +! CHECK: %[[factor:.*]] = fir.convert %[[dfactor]] : (f64) -> f32 +! CHECK: %[[mul:.*]] = arith.mulf %{{.*}}, %[[factor]] fastmath : f32 +! CHECK: %[[cos:.*]] = math.cos %[[mul]] fastmath : f32 + +function test_real8(x) + real(8) :: x, test_real8 + test_real8 = cospi(x) +end function + +! CHECK-LABEL: @_QPtest_real8 +! CHECK: %[[dfactor:.*]] = arith.constant 3.1415926535897931 : f64 +! CHECK: %[[mul:.*]] = arith.mulf %{{.*}}, %[[dfactor]] fastmath : f64 +! CHECK: %[[cos:.*]] = math.cos %[[mul]] fastmath : f64 diff --git a/flang/test/Lower/OpenACC/acc-atomic-read.f90 b/flang/test/Lower/OpenACC/acc-atomic-read.f90 index 639a98051e3a2..76751a0fa63a8 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-read.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-read.f90 @@ -8,7 +8,7 @@ program acc_atomic_test g = h end program acc_atomic_test -! CHECK: func @_QQmain() attributes {fir.bindc_name = "acc_atomic_test"} { +! CHECK: func @_QQmain() attributes {fir.bindc_name = "ACC_ATOMIC_TEST"} { ! CHECK: %[[VAR_G:.*]] = fir.alloca f32 {bindc_name = "g", uniq_name = "_QFEg"} ! 
CHECK: %[[G_DECL:.*]]:2 = hlfir.declare %[[VAR_G]] {uniq_name = "_QFEg"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAR_H:.*]] = fir.alloca f32 {bindc_name = "h", uniq_name = "_QFEh"} diff --git a/flang/test/Lower/OpenACC/acc-atomic-write.f90 b/flang/test/Lower/OpenACC/acc-atomic-write.f90 index 3c55394021abf..e0116e3281820 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-write.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-write.f90 @@ -2,7 +2,7 @@ ! This test checks the lowering of atomic write -!CHECK: func @_QQmain() attributes {fir.bindc_name = "acc_atomic_write_test"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "ACC_ATOMIC_WRITE_TEST"} { !CHECK: %[[VAR_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[VAR_X]] {uniq_name = "_QFEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[VAR_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"} diff --git a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 index 164eb32a8f684..2de7cc5761a2b 100644 --- a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 @@ -15,15 +15,17 @@ subroutine acc_host_data() !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} + ! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) !$acc host_data use_device(a) if_present !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) { +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>{{.*}}) { ! CHECK: } attributes {ifPresent} !$acc host_data use_device(a) if(ifCondition) @@ -33,14 +35,14 @@ subroutine acc_host_data() ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref> ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1 -! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.true.) !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! 
CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.false.) a = 1.0 diff --git a/flang/test/Lower/OpenACC/acc-host-data.f90 b/flang/test/Lower/OpenACC/acc-host-data.f90 index 871eabd256ca6..4d09b25b983b9 100644 --- a/flang/test/Lower/OpenACC/acc-host-data.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data.f90 @@ -14,34 +14,37 @@ subroutine acc_host_data() !$acc host_data use_device(a) !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) !$acc host_data use_device(a) if_present !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) { +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) ! CHECK: } attributes {ifPresent} - !$acc host_data use_device(a) if_present if_present + !$acc host_data use_device(a) if_present !$acc end host_data -! CHECK: acc.host_data dataOperands(%{{.*}} : !fir.ref>) { +! CHECK: acc.host_data dataOperands(%{{.*}}{{.*}} : !fir.ref>{{.*}}) { ! CHECK: } attributes {ifPresent} !$acc host_data use_device(a) if(ifCondition) !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref> ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1 -! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA0]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.true.) !$acc end host_data ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.false.) a = 1.0 diff --git a/flang/test/Lower/OpenACC/acc-routine.f90 b/flang/test/Lower/OpenACC/acc-routine.f90 index 789f3a57e1f79..1a63b4120235c 100644 --- a/flang/test/Lower/OpenACC/acc-routine.f90 +++ b/flang/test/Lower/OpenACC/acc-routine.f90 @@ -2,13 +2,14 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s -! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) -! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) +! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind(@_QPacc_routine17 [#acc.device_type], @_QPacc_routine17 +! 
[#acc.device_type], @_QPacc_routine16 [#acc.device_type]) +! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind(@_QPacc_routine17 [#acc.device_type], @_QPacc_routine16 [#acc.device_type]) ! CHECK: acc.routine @[[r12:.*]] func(@_QPacc_routine17) worker ([#acc.device_type]) vector ([#acc.device_type]) ! CHECK: acc.routine @[[r11:.*]] func(@_QPacc_routine16) gang([#acc.device_type]) seq ([#acc.device_type]) ! CHECK: acc.routine @[[r10:.*]] func(@_QPacc_routine11) seq ! CHECK: acc.routine @[[r09:.*]] func(@_QPacc_routine10) seq -! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind("_QPacc_routine9a") +! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind(@_QPacc_routine9a) ! CHECK: acc.routine @[[r07:.*]] func(@_QPacc_routine8) bind("routine8_") ! CHECK: acc.routine @[[r06:.*]] func(@_QPacc_routine7) gang(dim: 1 : i64) ! CHECK: acc.routine @[[r05:.*]] func(@_QPacc_routine6) nohost diff --git a/flang/test/Lower/OpenACC/acc-routine03.f90 b/flang/test/Lower/OpenACC/acc-routine03.f90 index 85e4ef580f983..ddd6bda0367e4 100644 --- a/flang/test/Lower/OpenACC/acc-routine03.f90 +++ b/flang/test/Lower/OpenACC/acc-routine03.f90 @@ -30,6 +30,6 @@ subroutine sub2(a) end subroutine ! CHECK: acc.routine @acc_routine_1 func(@_QPsub2) worker nohost -! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind("_QPsub2") worker +! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind(@_QPsub2) worker ! CHECK: func.func @_QPsub1(%arg0: !fir.box> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} ! CHECK: func.func @_QPsub2(%arg0: !fir.box> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>} diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90 index f603376163901..655e2762b9694 100644 --- a/flang/test/Lower/OpenACC/acc-routine04.f90 +++ b/flang/test/Lower/OpenACC/acc-routine04.f90 @@ -30,5 +30,5 @@ subroutine sub2() ! CHECK: acc.routine @acc_routine_1 func(@_QFPsub2) seq ! CHECK: acc.routine @acc_routine_0 func(@_QMdummy_modPsub1) seq ! CHECK: func.func @_QMdummy_modPsub1(%arg0: !fir.ref {fir.bindc_name = "i"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} -! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test_acc_routine"} +! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "TEST_ACC_ROUTINE"} ! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} diff --git a/flang/test/Lower/OpenACC/acc-use-device.f90 b/flang/test/Lower/OpenACC/acc-use-device.f90 new file mode 100644 index 0000000000000..081a6e317bfc9 --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-use-device.f90 @@ -0,0 +1,61 @@ +! This test checks whether the OpenACC use_device clause is applied on both results of hlfir.declare. + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s + +! Test for automatic variable appearing in use_device clause. +subroutine test() + integer :: N = 100 + real*8 :: b(-1:N) +! CHECK: %[[A0:.*]] = fir.alloca !fir.array, %{{.*}} {bindc_name = "b", uniq_name = "_QFtestEb"} +! CHECK: %[[A1:.*]] = fir.shape_shift {{.*}} : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[A0]](%[[A1]]) {uniq_name = "_QFtestEb"} : (!fir.ref>, !fir.shapeshift<1>) -> (!fir.box>, !fir.ref>) + + !$acc data copy(b) +! CHECK: %[[B:.*]] = acc.copyin var(%[[A]]#0 : !fir.box>) -> !fir.box> {dataClause = #acc, name = "b"} +! 
CHECK: acc.data dataOperands(%[[B]] : !fir.box>) { + + !$acc host_data use_device(b) + call vadd(b) + !$acc end host_data +! CHECK: %[[C:.*]] = acc.use_device var(%[[A]]#0 : !fir.box>) -> !fir.box> {name = "b"} +! CHECK: %[[D:.*]] = acc.use_device varPtr(%[[A]]#1 : !fir.ref>) -> !fir.ref> {name = "b"} +! CHECK: acc.host_data dataOperands(%[[C]], %[[D]] : !fir.box>, !fir.ref>) { +! CHECK: fir.call @_QPvadd(%[[A]]#1) fastmath : (!fir.ref>) -> () + !$acc end data +! CHECK: acc.copyout accVar(%[[B]] : !fir.box>) to var(%[[A]]#0 : !fir.box>) {dataClause = #acc, name = "b"} +end + +! Test for allocatable, pointer and assumed-shape variables appearing in use_device clause. +subroutine test2(a, b, c) + integer :: N = 100 + real*8, allocatable :: a(:) + real*8, target, allocatable :: d(:) + real*8 :: b(:) + real*8, pointer :: c(:) + call allocate(a(N)) + call allocate(d(N)) + c => d +! CHECK: %[[DS:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest2Ea"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {uniq_name = "_QFtest2Eb"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest2Ec"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) + + !$acc data copy(a,b,c,d) + !$acc host_data use_device(a,b,c) + call vadd2(a,b,c) + !$acc end host_data + +! CHECK: %[[H:.*]] = acc.use_device varPtr(%[[E]]#0 : !fir.ref>>>) -> !fir.ref>>> {name = "a"} +! CHECK: %[[I:.*]] = acc.use_device varPtr(%[[E]]#1 : !fir.ref>>>) -> !fir.ref>>> {name = "a"} +! CHECK: %[[J:.*]] = acc.use_device var(%[[F]]#0 : !fir.box>) -> !fir.box> {name = "b"} +! CHECK: %[[K:.*]] = acc.use_device var(%[[F]]#1 : !fir.box>) -> !fir.box> {name = "b"} +! CHECK: %[[L:.*]] = acc.use_device varPtr(%[[G]]#0 : !fir.ref>>>) -> !fir.ref>>> {name = "c"} +! CHECK: %[[M:.*]] = acc.use_device varPtr(%[[G]]#1 : !fir.ref>>>) -> !fir.ref>>> {name = "c"} +! CHECK: acc.host_data dataOperands(%[[H]], %[[I]], %[[J]], %[[K]], %[[L]], %[[M]] : !fir.ref>>>, !fir.ref>>>, !fir.box>, !fir.box>, !fir.ref>>>, !fir.ref>>>) { + + + + + !$acc end data + +end diff --git a/flang/test/Lower/OpenMP/atomic-read.f90 b/flang/test/Lower/OpenMP/atomic-read.f90 index 68dcaac90eef5..30313e240efa3 100644 --- a/flang/test/Lower/OpenMP/atomic-read.f90 +++ b/flang/test/Lower/OpenMP/atomic-read.f90 @@ -4,7 +4,7 @@ ! This test checks the lowering of atomic read -!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomic"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "OMPATOMIC"} { !CHECK: %[[A_REF:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_REF]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[B_REF:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"} diff --git a/flang/test/Lower/OpenMP/atomic-write.f90 b/flang/test/Lower/OpenMP/atomic-write.f90 index 6eded49b0b15d..742fd475c0f04 100644 --- a/flang/test/Lower/OpenMP/atomic-write.f90 +++ b/flang/test/Lower/OpenMP/atomic-write.f90 @@ -4,7 +4,7 @@ ! 
This test checks the lowering of atomic write -!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomicwrite"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "OMPATOMICWRITE"} { !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Y_REF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"} diff --git a/flang/test/Lower/OpenMP/common-atomic-lowering.f90 b/flang/test/Lower/OpenMP/common-atomic-lowering.f90 index a53cc101024c6..f729bbb00ac8e 100644 --- a/flang/test/Lower/OpenMP/common-atomic-lowering.f90 +++ b/flang/test/Lower/OpenMP/common-atomic-lowering.f90 @@ -1,6 +1,6 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "sample"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "SAMPLE"} { !CHECK: %[[val_0:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[val_1:.*]]:2 = hlfir.declare %[[val_0]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[val_2:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"} diff --git a/flang/test/Lower/OpenMP/cray-pointers02.f90 b/flang/test/Lower/OpenMP/cray-pointers02.f90 index 19e4cd09fe50a..79d838702e4b0 100644 --- a/flang/test/Lower/OpenMP/cray-pointers02.f90 +++ b/flang/test/Lower/OpenMP/cray-pointers02.f90 @@ -1,7 +1,7 @@ ! Test lowering of Cray pointee references. ! RUN: flang -fc1 -emit-hlfir -fopenmp %s -o - 2>&1 | FileCheck %s -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test_cray_pointers_02"} +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST_CRAY_POINTERS_02"} program test_cray_pointers_02 implicit none diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90 index c44c6bb966580..af51c4cc3e814 100644 --- a/flang/test/Lower/OpenMP/default-clause-byref.f90 +++ b/flang/test/Lower/OpenMP/default-clause-byref.f90 @@ -34,7 +34,7 @@ !CHECK: omp.yield(%[[PRIV_X]] : !fir.ref) !CHECK: } -!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "DEFAULT_CLAUSE_LOWERING"} { !CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"} !CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFEw"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} diff --git a/flang/test/Lower/OpenMP/default-clause.f90 b/flang/test/Lower/OpenMP/default-clause.f90 index ee5f579f06b91..505fa4f0f5d63 100644 --- a/flang/test/Lower/OpenMP/default-clause.f90 +++ b/flang/test/Lower/OpenMP/default-clause.f90 @@ -8,7 +8,7 @@ ! 
RUN: | FileCheck %s -!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "DEFAULT_CLAUSE_LOWERING"} { !CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"} !CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFEw"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 index 4bfd5d8d19261..0036670317a8e 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 @@ -80,7 +80,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 index ec54294c7104f..ea0aa9ec3f53b 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 @@ -68,7 +68,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 2 : index diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 index 488ecc353af8e..eb0df2b3a17de 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 @@ -63,7 +63,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! 
CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 index 596276a99cafc..2caec0384a6ab 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 @@ -18,7 +18,7 @@ !CHECK: fir.store %[[CR]] to %[[C0]] : !fir.ref !CHECK: omp.yield(%[[C0]] : !fir.ref) !CHECK: } -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "mn"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MN"} { !CHECK: %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} !CHECK: %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 index f638688bc2cc1..3c1daa0eb983f 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 @@ -82,7 +82,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 index c06343e997bfd..2be154f4bbaf5 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 @@ -25,7 +25,7 @@ end program main ! CHECK: omp.yield(%[[VAL_2]] : i32) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFEn"} ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/parallel-reduction.f90 b/flang/test/Lower/OpenMP/parallel-reduction.f90 index 612549fb32de5..15e8cc325916d 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction.f90 @@ -10,7 +10,7 @@ !CHECK: %[[CR:[_a-z0-9]+]] = arith.addi %[[C0]], %[[C1]] : i32 !CHECK: omp.yield(%[[CR]] : i32) !CHECK: } -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "mn"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MN"} { !CHECK: %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} !CHECK: %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90 index d11925cafdc12..3d5c0326fb6b9 100644 --- a/flang/test/Lower/OpenMP/sections.f90 +++ b/flang/test/Lower/OpenMP/sections.f90 @@ -5,7 +5,7 @@ ! RUN: %flang_fc1 -emit-hlfir %openmp_flags %s -o - | FileCheck %s ! 
RUN: bbc -hlfir -emit-hlfir %openmp_flags %s -o - | FileCheck %s -!CHECK: func @_QQmain() attributes {fir.bindc_name = "sample"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "SAMPLE"} { !CHECK: %[[COUNT:.*]] = fir.address_of(@_QFEcount) : !fir.ref !CHECK: %[[COUNT_DECL:.*]]:2 = hlfir.declare %[[COUNT]] {uniq_name = "_QFEcount"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[ETA:.*]] = fir.alloca f32 {bindc_name = "eta", uniq_name = "_QFEeta"} diff --git a/flang/test/Lower/OpenMP/taskgroup02.f90 b/flang/test/Lower/OpenMP/taskgroup02.f90 index 1e996a030c23a..4c470b7aa82d1 100644 --- a/flang/test/Lower/OpenMP/taskgroup02.f90 +++ b/flang/test/Lower/OpenMP/taskgroup02.f90 @@ -3,8 +3,9 @@ ! Check that variables are not privatized twice when TASKGROUP is used. !CHECK-LABEL: func.func @_QPsub() { -!CHECK: omp.parallel { -!CHECK: %[[PAR_I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsubEi"} +!CHECK: omp.parallel private(@_QFsubEi_private_i32 %[[SUB_I:.*]]#0 -> %[[ARG:.*]] : !fir.ref) +!CHECK: %[[ALLOCA:.*]] = fir.alloca i32 +!CHECK: %[[PAR_I:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFsubEi"} !CHECK: omp.master { !CHECK: omp.taskgroup { !CHECK-NEXT: omp.task private(@_QFsubEi_firstprivate_i32 %[[PAR_I]]#0 -> %[[TASK_I:.*]] : !fir.ref) { diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 index 5e54cef8c29db..5c90ef7d84f89 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 index 21547b47cf381..0e61261e8853e 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 index 7a27efa2f84aa..1887e8aa68fdc 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.address_of(@_QFEa) : !fir.ref !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : 
(!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TP_A:.*]] = omp.threadprivate %[[A_DECL]]#0 : !fir.ref -> !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-chunks.f90 b/flang/test/Lower/OpenMP/wsloop-chunks.f90 index 29c02a3b3c8d5..f3f11d8c4a6c2 100644 --- a/flang/test/Lower/OpenMP/wsloop-chunks.f90 +++ b/flang/test/Lower/OpenMP/wsloop-chunks.f90 @@ -7,7 +7,7 @@ program wsloop integer :: i integer :: chunk -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "WSLOOP"} { ! CHECK: %[[CHUNK_REF:.*]] = fir.alloca i32 {bindc_name = "chunk", uniq_name = "_QFEchunk"} ! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[CHUNK_REF]] {uniq_name = "_QFEchunk"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/wsloop-collapse.f90 index a4d5cbdc03d3e..7ec40ab4b2f43 100644 --- a/flang/test/Lower/OpenMP/wsloop-collapse.f90 +++ b/flang/test/Lower/OpenMP/wsloop-collapse.f90 @@ -2,7 +2,7 @@ ! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s -!CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop_collapse"} { +!CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "WSLOOP_COLLAPSE"} { program wsloop_collapse !CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index 58b68e5ec4cfd..e2f75bc8e4481 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -156,7 +156,7 @@ program reduce15 ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce15"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE15"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEarr) : !fir.ref>>> ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}, uniq_name = "_QFEarr"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 index 0a536eb34e7af..663851cba46c6 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 @@ -63,7 +63,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> {bindc_name = "r", uniq_name = "_QFEr"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 index 9f0dd16002baf..2233a74600948 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 @@ -31,5 +31,5 @@ program reduce ! CHECK: omp.yield(%[[ARG0]] : !fir.ref>>) ! CHECK: } -! 
CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: omp.wsloop {{.*}} reduction(byref @add_reduction_byref_box_2xi32 %{{.*}} -> %{{.*}} : !fir.ref>>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 index 5ada623a0ed23..211bde19da8db 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 @@ -40,5 +40,5 @@ subroutine sub(a, lb, ub) ! CHECK: omp.yield(%[[ARG0]] : !fir.ref>>) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: omp.wsloop {{.*}} reduction(byref @add_reduction_byref_box_Uxi32 %{{.*}} -> %{{.*}} : !fir.ref>>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index 21261da49710c..b7882bcbc0d13 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -65,7 +65,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index ab8dcf1f076c0..7d90335a13a87 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -65,7 +65,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 index 1e26f5a24d41e..d776bd7cfdd03 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 @@ -28,7 +28,7 @@ program reduce ! CHECK: omp.yield(%[[VAL_2]] : i32) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 index e0a3b469f40c1..5133db0347034 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 @@ -93,7 +93,7 @@ program main ! CHECK: omp.yield(%[[VAL_2]] : f64) ! CHECK: } -! 
CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEarray) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 3 : index diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 index 40b4302f24cd4..27b726376fbeb 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 @@ -64,7 +64,7 @@ program reduce_pointer ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce_pointer"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE_POINTER"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> {bindc_name = "v", uniq_name = "_QFEv"} diff --git a/flang/test/Lower/amdgcn-complex.f90 b/flang/test/Lower/amdgcn-complex.f90 new file mode 100644 index 0000000000000..f15c7db2b7316 --- /dev/null +++ b/flang/test/Lower/amdgcn-complex.f90 @@ -0,0 +1,21 @@ +! REQUIRES: amdgpu-registered-target +! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s + +subroutine cabsf_test(a, b) + complex :: a + real :: b + b = abs(a) +end subroutine + +! CHECK-LABEL: func @_QPcabsf_test( +! CHECK: complex.abs +! CHECK-NOT: fir.call @cabsf + +subroutine cexpf_test(a, b) + complex :: a, b + b = exp(a) +end subroutine + +! CHECK-LABEL: func @_QPcexpf_test( +! CHECK: complex.exp +! CHECK-NOT: fir.call @cexpf diff --git a/flang/test/Lower/array-character.f90 b/flang/test/Lower/array-character.f90 index 1bc73dae44235..e2899d967c80d 100644 --- a/flang/test/Lower/array-character.f90 +++ b/flang/test/Lower/array-character.f90 @@ -32,7 +32,7 @@ program p call charlit end program p -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.array<3x!fir.char<1,4>> {bindc_name = "c1", uniq_name = "_QFEc1"} diff --git a/flang/test/Lower/array-expression-slice-1.f90 b/flang/test/Lower/array-expression-slice-1.f90 index b597814bc0d9f..73943137cb18d 100644 --- a/flang/test/Lower/array-expression-slice-1.f90 +++ b/flang/test/Lower/array-expression-slice-1.f90 @@ -1,6 +1,6 @@ ! RUN: bbc -hlfir=false -fwrapv -o - --outline-intrinsics %s | FileCheck %s -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 10 : index ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index diff --git a/flang/test/Lower/basic-program.f90 b/flang/test/Lower/basic-program.f90 index 5a0e4bdc7b4a1..7e5b40d9e2f0a 100644 --- a/flang/test/Lower/basic-program.f90 +++ b/flang/test/Lower/basic-program.f90 @@ -4,10 +4,10 @@ program basic end program -! CHECK: 1 Program basic +! CHECK: 1 Program BASIC ! CHECK: 1 EndProgramStmt: end program -! CHECK: End Program basic +! CHECK: End Program BASIC -! 
FIR-LABEL: func @_QQmain() attributes {fir.bindc_name = "basic"} { +! FIR-LABEL: func @_QQmain() attributes {fir.bindc_name = "BASIC"} { ! FIR: return ! FIR: } diff --git a/flang/test/Lower/big-integer-parameter.f90 b/flang/test/Lower/big-integer-parameter.f90 index a413b1224ebc2..ca90b8adfb318 100644 --- a/flang/test/Lower/big-integer-parameter.f90 +++ b/flang/test/Lower/big-integer-parameter.f90 @@ -13,7 +13,7 @@ program i128 print*,y end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "i128"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "I128"} { ! CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputInteger128(%{{.*}}, %{{.*}}) {{.*}}: (!fir.ref, i128) -> i1 diff --git a/flang/test/Lower/derived-type-finalization.f90 b/flang/test/Lower/derived-type-finalization.f90 index 3ea58cd719f4a..71cef34899603 100644 --- a/flang/test/Lower/derived-type-finalization.f90 +++ b/flang/test/Lower/derived-type-finalization.f90 @@ -255,5 +255,5 @@ program p type(t1) :: t end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK-NOT: fir.call @_FortranADestroy diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90 index 1e4bb73350871..d1e12a8dbdfec 100644 --- a/flang/test/Lower/forall/character-1.f90 +++ b/flang/test/Lower/forall/character-1.f90 @@ -29,7 +29,9 @@ end program test ! CHECK: %[[esval:.*]] = load i64, ptr %[[elesize]] ! CHECK: %[[mul:.*]] = mul i64 1, %[[esval]] ! CHECK: %[[mul2:.*]] = mul i64 %[[mul]], %[[extval]] -! CHECK: %[[buff:.*]] = call ptr @malloc(i64 %[[mul2]]) +! CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +! CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +! CHECK: %[[buff:.*]] = call ptr @malloc(i64 %[[size]]) ! CHECK: %[[to:.*]] = getelementptr i8, ptr %[[buff]], i64 % ! CHECK: call void @llvm.memmove.p0.p0.i64(ptr %[[to]], ptr %{{.*}}, i64 %{{.*}}, i1 false) ! CHECK: call void @free(ptr %[[buff]]) diff --git a/flang/test/Lower/io-derived-type.f90 b/flang/test/Lower/io-derived-type.f90 index 7d2fef3faa2b7..7c289ce261678 100644 --- a/flang/test/Lower/io-derived-type.f90 +++ b/flang/test/Lower/io-derived-type.f90 @@ -37,16 +37,16 @@ subroutine test1 import, all ! CHECK: %[[V_16:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_17:[0-9]+]] = fir.convert %[[V_16]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_18:[0-9]+]] = fir.address_of(@_QQMmFtest1.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_18]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_18:[0-9]+]] = fir.address_of(@_QQMmFtest1.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_18]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_20:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_17]], %[[V_19]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'test1 outer, should call wft: ', t(1) block import, only: t ! CHECK: %[[V_37:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_38:[0-9]+]] = fir.convert %[[V_37]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_39:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_40:[0-9]+]] = fir.convert %[[V_39]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! 
CHECK: %[[V_39:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_40:[0-9]+]] = fir.convert %[[V_39]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_41:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_38]], %[[V_40]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'test1 block, should not call wft: ', t(2) end block @@ -56,8 +56,8 @@ subroutine test1 subroutine test2 ! CHECK: %[[V_15:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_16:[0-9]+]] = fir.convert %[[V_15]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_17:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_18:[0-9]+]] = fir.convert %[[V_17]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_17:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_18:[0-9]+]] = fir.convert %[[V_17]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_19:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_16]], %[[V_18]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 import, only: t @@ -74,23 +74,23 @@ subroutine test3(p, x) ! CHECK: %[[V_3:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_4:[0-9]+]] = fir.convert %[[V_3]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_5:[0-9]+]] = fir.alloca !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_6:[0-9]+]] = fir.undefined !fir.array<1xtuple, !fir.ref, i32, i1>> + ! CHECK: %[[V_5:[0-9]+]] = fir.alloca !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_6:[0-9]+]] = fir.undefined !fir.array<1xtuple, !fir.ref, i32, i8>> ! CHECK: %[[V_7:[0-9]+]] = fir.address_of(@_QMmE.dt.t) ! CHECK: %[[V_8:[0-9]+]] = fir.convert %[[V_7]] : {{.*}} -> !fir.ref - ! CHECK: %[[V_9:[0-9]+]] = fir.insert_value %[[V_6]], %[[V_8]], [0 : index, 0 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i1>> + ! CHECK: %[[V_9:[0-9]+]] = fir.insert_value %[[V_6]], %[[V_8]], [0 : index, 0 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i8>> ! CHECK: %[[V_10:[0-9]+]] = fir.box_addr %arg0 : (!fir.boxproc<() -> ()>) -> !fir.ref - ! CHECK: %[[V_11:[0-9]+]] = fir.insert_value %[[V_9]], %[[V_10]], [0 : index, 1 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_12:[0-9]+]] = fir.insert_value %[[V_11]], %c2{{.*}}, [0 : index, 2 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, i32) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_13:[0-9]+]] = fir.insert_value %[[V_12]], %true, [0 : index, 3 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, i1) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: fir.store %[[V_13]] to %[[V_5]] : !fir.ref, !fir.ref, i32, i1>>> - ! CHECK: %[[V_14:[0-9]+]] = fir.alloca tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_15:[0-9]+]] = fir.undefined tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_16:[0-9]+]] = fir.insert_value %[[V_15]], %c1{{.*}}, [0 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, i64) -> tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_17:[0-9]+]] = fir.insert_value %[[V_16]], %[[V_5]], [1 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, !fir.ref, !fir.ref, i32, i1>>>) -> tuple, !fir.ref, i32, i1>>>, i1> - ! 
CHECK: %[[V_18:[0-9]+]] = fir.insert_value %[[V_17]], %true_0, [2 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, i1) -> tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: fir.store %[[V_18]] to %[[V_14]] : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_14]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_11:[0-9]+]] = fir.insert_value %[[V_9]], %[[V_10]], [0 : index, 1 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_12:[0-9]+]] = fir.insert_value %[[V_11]], %c2{{.*}}, [0 : index, 2 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, i32) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_13:[0-9]+]] = fir.insert_value %[[V_12]], %c1_i8, [0 : index, 3 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, i8) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: fir.store %[[V_13]] to %[[V_5]] : !fir.ref, !fir.ref, i32, i8>>> + ! CHECK: %[[V_14:[0-9]+]] = fir.alloca tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_15:[0-9]+]] = fir.undefined tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_16:[0-9]+]] = fir.insert_value %[[V_15]], %c1{{.*}}, [0 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, i64) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_17:[0-9]+]] = fir.insert_value %[[V_16]], %[[V_5]], [1 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, !fir.ref, !fir.ref, i32, i8>>>) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_18:[0-9]+]] = fir.insert_value %[[V_17]], %true, [2 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, i1) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: fir.store %[[V_18]] to %[[V_14]] : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_14]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_20:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_4]], %[[V_19]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, x end subroutine @@ -112,8 +112,8 @@ program p ! CHECK: %[[V_97:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_98:[0-9]+]] = fir.convert %[[V_97]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_99:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_100:[0-9]+]] = fir.convert %[[V_99]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_99:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_100:[0-9]+]] = fir.convert %[[V_99]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_101:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_98]], %[[V_100]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'main, should call wft: ', t(4) @@ -122,14 +122,14 @@ program p ! CHECK: %[[V_35:[0-9]+]] = fir.shape %c2{{.*}} : (index) -> !fir.shape<1> ! CHECK: %[[V_36:[0-9]+]] = fir.embox %[[V_34]](%[[V_35]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> ! CHECK: %[[V_37:[0-9]+]] = fir.convert %[[V_36]] : (!fir.box>>) -> !fir.box - ! CHECK: %[[V_38:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_39:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_38:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_39:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! 
CHECK: %[[V_40:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_37]], %[[V_39]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, y(2:3) end -! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i1>> -! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> -! CHECK: fir.global linkonce @_QQdefault.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> -! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i1>> -! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> +! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i8>> +! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> +! CHECK: fir.global linkonce @_QQdefault.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> +! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i8>> +! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> diff --git a/flang/test/Lower/location.f90 b/flang/test/Lower/location.f90 index a6ece31bbebed..95bf2260fc107 100644 --- a/flang/test/Lower/location.f90 +++ b/flang/test/Lower/location.f90 @@ -5,7 +5,7 @@ program test end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: fir.call @_FortranAioOutputAscii(%{{.*}}, %{{.*}}, %{{.*}}) fastmath : (!fir.ref, !fir.ref, i64) -> i1 loc(fused<#fir>["{{.*}}location1.inc":1:10, "{{.*}}location0.inc":1:1, "{{.*}}location.f90":4:1]) ! CHECK: return loc("{{.*}}location.f90":6:1) ! CHECK: } loc("{{.*}}location.f90":3:1) diff --git a/flang/test/Lower/namelist.f90 b/flang/test/Lower/namelist.f90 index 94b0ef11cb102..770af46eea744 100644 --- a/flang/test/Lower/namelist.f90 +++ b/flang/test/Lower/namelist.f90 @@ -42,8 +42,8 @@ program p ! CHECK: %[[V_42:[0-9]+]] = fir.insert_value %[[V_39]], %[[V_41]], [0 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_43:[0-9]+]] = fir.insert_value %[[V_42]], %c2{{.*}}, [1 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, i64) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_44:[0-9]+]] = fir.insert_value %[[V_43]], %[[V_24]], [2 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref, !fir.ref>>>>) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> - ! CHECK: %[[V_45:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_45:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_47:[0-9]+]] = fir.insert_value %[[V_44]], %[[V_46]], [3 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: fir.store %[[V_47]] to %[[V_38]] : !fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>> ! 
CHECK: %[[V_48:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>>) -> !fir.ref> @@ -100,8 +100,8 @@ subroutine sss ! CHECK: %[[V_20:[0-9]+]] = fir.insert_value %[[V_17]], %[[V_19]], [0 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_21:[0-9]+]] = fir.insert_value %[[V_20]], %c1{{.*}}, [1 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, i64) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_22:[0-9]+]] = fir.insert_value %[[V_21]], %[[V_8]], [2 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref, !fir.ref>>>>) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> - ! CHECK: %[[V_23:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_24:[0-9]+]] = fir.convert %[[V_23]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_23:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_24:[0-9]+]] = fir.convert %[[V_23]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_25:[0-9]+]] = fir.insert_value %[[V_22]], %[[V_24]], [3 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: fir.store %[[V_25]] to %[[V_16]] : !fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>> ! CHECK: %[[V_26:[0-9]+]] = fir.convert %[[V_16]] : (!fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>>) -> !fir.ref> diff --git a/flang/test/Lower/nested-where.f90 b/flang/test/Lower/nested-where.f90 index ab457280b80ce..28aced2325813 100644 --- a/flang/test/Lower/nested-where.f90 +++ b/flang/test/Lower/nested-where.f90 @@ -1,6 +1,6 @@ ! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "nested_where"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "NESTED_WHERE"} { program nested_where ! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index b7be5f685d9e3..a84b495dd09d0 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -1146,7 +1146,7 @@ program test l = i < o%inner end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: %[[ADDR_O:.*]] = fir.address_of(@_QFEo) : !fir.ref}>>>> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ADDR_O]] : (!fir.ref}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/pre-fir-tree02.f90 b/flang/test/Lower/pre-fir-tree02.f90 index f4fa626ba6548..65c33e9b364fe 100644 --- a/flang/test/Lower/pre-fir-tree02.f90 +++ b/flang/test/Lower/pre-fir-tree02.f90 @@ -3,7 +3,7 @@ ! Test Pre-FIR Tree captures all the intended nodes from the parse-tree ! Coarray and OpenMP related nodes are tested in other files. -! CHECK: Program test_prog +! CHECK: Program TEST_PROG program test_prog ! Check specification part is not part of the tree. 
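! As a minimal sketch of the convention these CHECK updates encode (the
! program below is illustrative and not taken from this patch): for a unit
! written as
!
!   program demo
!   end program
!
! the parse-tree and symbol dumps are now expected to print the main-program
! name in upper case ("Program DEMO", "MainProgram scope: DEMO", and
! fir.bindc_name = "DEMO" after lowering), which is why the lower-case names
! matched by the old CHECK lines are updated throughout these tests.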
interface diff --git a/flang/test/Lower/pre-fir-tree03.f90 b/flang/test/Lower/pre-fir-tree03.f90 index 313dab4d6ec7c..1de66e3f8d016 100644 --- a/flang/test/Lower/pre-fir-tree03.f90 +++ b/flang/test/Lower/pre-fir-tree03.f90 @@ -2,7 +2,7 @@ ! Test Pre-FIR Tree captures OpenMP related constructs -! CHECK: Program test_omp +! CHECK: Program TEST_OMP program test_omp ! CHECK: PrintStmt print *, "sequential" diff --git a/flang/test/Lower/pre-fir-tree06.f90 b/flang/test/Lower/pre-fir-tree06.f90 index f84bcd8b58b2d..ed1e76cb375bd 100644 --- a/flang/test/Lower/pre-fir-tree06.f90 +++ b/flang/test/Lower/pre-fir-tree06.f90 @@ -25,13 +25,13 @@ subroutine sub2() end ! CHECK: End Module m2 -! CHECK: Program main +! CHECK: Program MAIN program main real :: y ! CHECK-NEXT: OpenMPDeclarativeConstruct !$omp threadprivate(y) end -! CHECK: End Program main +! CHECK: End Program MAIN ! CHECK: Subroutine sub1 subroutine sub1() diff --git a/flang/test/Lower/program-units-fir-mangling.f90 b/flang/test/Lower/program-units-fir-mangling.f90 index e0af6f065f34d..65940b4e1ff17 100644 --- a/flang/test/Lower/program-units-fir-mangling.f90 +++ b/flang/test/Lower/program-units-fir-mangling.f90 @@ -124,7 +124,7 @@ subroutine should_not_collide() ! CHECK: } end subroutine -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "TEST"} { program test ! CHECK: } contains diff --git a/flang/test/Lower/return-statement.f90 b/flang/test/Lower/return-statement.f90 index 6351a6859eb4f..8ab69e3146e2f 100644 --- a/flang/test/Lower/return-statement.f90 +++ b/flang/test/Lower/return-statement.f90 @@ -4,7 +4,7 @@ program basic return end program -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "basic"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "BASIC"} { ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90 index 2e05b652822b5..d1a844eddd106 100644 --- a/flang/test/Lower/volatile-openmp.f90 +++ b/flang/test/Lower/volatile-openmp.f90 @@ -23,11 +23,11 @@ ! CHECK: %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref>>}>> ! CHECK: %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref>>}>>) -> !fir.ref>>}>, volatile> ! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEcontainer"} : (!fir.ref>>}>, volatile>) -> (!fir.ref>>}>, volatile>, !fir.ref>>}>, volatile>) -! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> +! 
CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> ! CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1> -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, 
!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) -! CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> -! 
CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) +! 
CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) +! 
CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> +! CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, 
!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) ! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>, volatile>) -> !fir.ref>>, volatile> ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref>>, volatile> ! CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box>>, index) -> (index, index, index) diff --git a/flang/test/Lower/volatile-openmp1.f90 b/flang/test/Lower/volatile-openmp1.f90 index 163db953b6b80..07d81a1aeb240 100644 --- a/flang/test/Lower/volatile-openmp1.f90 +++ b/flang/test/Lower/volatile-openmp1.f90 @@ -13,7 +13,7 @@ program main !$omp end parallel end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_1:.*]] = arith.constant 1000 : i32 ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/volatile-string.f90 b/flang/test/Lower/volatile-string.f90 index 88b21d7b245e9..f263db7abb5fc 100644 --- a/flang/test/Lower/volatile-string.f90 +++ b/flang/test/Lower/volatile-string.f90 @@ -21,7 +21,7 @@ subroutine assign_different_length(string) end subroutine end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 11 : i32 ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant true diff --git a/flang/test/Lower/volatile3.f90 b/flang/test/Lower/volatile3.f90 index 8825f8f3afbcb..a32f29d2bb9e7 100644 --- a/flang/test/Lower/volatile3.f90 +++ b/flang/test/Lower/volatile3.f90 @@ -70,7 +70,7 @@ subroutine sub_select_rank(arr) end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index diff --git a/flang/test/Parser/OpenMP/fail-construct1.f90 b/flang/test/Parser/OpenMP/fail-construct1.f90 index f0ee22125cee0..f0b3f7438ae58 100644 --- a/flang/test/Parser/OpenMP/fail-construct1.f90 +++ b/flang/test/Parser/OpenMP/fail-construct1.f90 @@ -1,5 +1,5 @@ ! RUN: not %flang_fc1 -fsyntax-only -fopenmp %s 2>&1 | FileCheck %s -! CHECK: error: expected OpenMP construct !$omp parallel +! 
CHECK: error: expected '!$OMP ' end diff --git a/flang/test/Parser/acc-unparse.f90 b/flang/test/Parser/acc-unparse.f90 index 62e0d4487f3f7..12e6dec19f272 100644 --- a/flang/test/Parser/acc-unparse.f90 +++ b/flang/test/Parser/acc-unparse.f90 @@ -15,7 +15,7 @@ program bug47659 end do label1 end program -!CHECK-LABEL: PROGRAM bug47659 +!CHECK-LABEL: PROGRAM BUG47659 !CHECK: !$ACC PARALLEL LOOP diff --git a/flang/test/Preprocessing/omp-sentinel-fixed-form.F b/flang/test/Preprocessing/omp-sentinel-fixed-form.F new file mode 100644 index 0000000000000..c8271682a5b92 --- /dev/null +++ b/flang/test/Preprocessing/omp-sentinel-fixed-form.F @@ -0,0 +1,21 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=50 %s | FileCheck %s + +#define OMP_TARGET .true. +#define OMP_SIMD .false. + program test + implicit none + integer i,j,n + n = 100 +! CHECK: !$OMP METADIRECTIVE WHEN(USER={CONDITION(.true._4)}: TARGET TEAMS DISTRIBUTE PARALLEL& +! CHECK: !$OMP& DO) DEFAULT(TARGET TEAMS LOOP) +!$omp metadirective +!$omp& when(user={condition(OMP_TARGET.or.OMP_SIMD)}: +!$omp& target teams distribute parallel do ) +!$omp& default(target teams loop) + do i=0,n + do j=0,n + write(*,*) "Test" + enddo + enddo + return + end program diff --git a/flang/test/Semantics/OpenACC/acc-symbols01.f90 b/flang/test/Semantics/OpenACC/acc-symbols01.f90 index 375445bad13a5..51a7a3a23e8ce 100644 --- a/flang/test/Semantics/OpenACC/acc-symbols01.f90 +++ b/flang/test/Semantics/OpenACC/acc-symbols01.f90 @@ -1,24 +1,24 @@ ! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenacc -!DEF: /mm MainProgram -program mm - !DEF: /mm/x ObjectEntity REAL(4) - !DEF: /mm/y ObjectEntity REAL(4) +!DEF: /MM MainProgram +program MM + !DEF: /MM/x ObjectEntity REAL(4) + !DEF: /MM/y ObjectEntity REAL(4) real x, y - !DEF: /mm/a ObjectEntity INTEGER(4) - !DEF: /mm/b ObjectEntity INTEGER(4) - !DEF: /mm/c ObjectEntity INTEGER(4) - !DEF: /mm/i ObjectEntity INTEGER(4) + !DEF: /MM/a ObjectEntity INTEGER(4) + !DEF: /MM/b ObjectEntity INTEGER(4) + !DEF: /MM/c ObjectEntity INTEGER(4) + !DEF: /MM/i ObjectEntity INTEGER(4) integer a(10), b(10), c(10), i - !REF: /mm/b + !REF: /MM/b b = 2 !$acc parallel present(c) firstprivate(b) private(a) !$acc loop - !REF: /mm/i + !REF: /MM/i do i=1,10 - !REF: /mm/a - !REF: /mm/i - !REF: /mm/b + !REF: /MM/a + !REF: /MM/i + !REF: /MM/b a(i) = b(i) end do !$acc end parallel diff --git a/flang/test/Semantics/OpenMP/critical_within_default.f90 b/flang/test/Semantics/OpenMP/critical_within_default.f90 index dd972e6e52949..a5fe30eeb7de0 100644 --- a/flang/test/Semantics/OpenMP/critical_within_default.f90 +++ b/flang/test/Semantics/OpenMP/critical_within_default.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s ! Test that we do not make a private copy of the critical name -!CHECK: MainProgram scope: mn +!CHECK: MainProgram scope: MN !CHECK-NEXT: j size=4 offset=0: ObjectEntity type: INTEGER(4) !CHECK-NEXT: OtherConstruct scope: !CHECK-NEXT: j (OmpPrivate): HostAssoc diff --git a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 index 06f41ab8ce76f..e57a5c0c1cea6 100644 --- a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 +++ b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 @@ -1,7 +1,7 @@ ! 
RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN implicit none type ty diff --git a/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 b/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 index 9d0a097fb1991..fc977f2f1b839 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 @@ -17,7 +17,7 @@ end function mymax end module mymod program omp_examples -!CHECK-LABEL: MainProgram scope: omp_examples +!CHECK-LABEL: MainProgram scope: OMP_EXAMPLES use mymod implicit none integer, parameter :: n = 100 diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 index d7a9f2fc0a36b..84dbe1af01877 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 @@ -49,7 +49,7 @@ function my_add(x, y) end module m1 program test_vector -!CHECK-LABEL: MainProgram scope: test_vector +!CHECK-LABEL: MainProgram scope: TEST_VECTOR use vector_mod !CHECK: add_vectors (Function): Use from add_vectors in vector_mod implicit none diff --git a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 index 12e80cbf7b327..9cd638d796091 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 @@ -22,7 +22,7 @@ end function my_mul end module module1 program test_omp_reduction -!CHECK: MainProgram scope: test_omp_reduction +!CHECK: MainProgram scope: TEST_OMP_REDUCTION use module1, only: t1, operator(.modmul.) => operator(.mul.) !CHECK: .modmul. (Function): Use from .mul. in module1 diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90 index ddca38fd57812..1f39c57c54ad1 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction.f90 @@ -31,7 +31,7 @@ end subroutine initme end function func program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) diff --git a/flang/test/Semantics/OpenMP/declare-target03.f90 b/flang/test/Semantics/OpenMP/declare-target03.f90 index 64a299d78224a..48cfc68393873 100644 --- a/flang/test/Semantics/OpenMP/declare-target03.f90 +++ b/flang/test/Semantics/OpenMP/declare-target03.f90 @@ -13,10 +13,10 @@ subroutine bar program main use mod1 - !ERROR: The module name or main program name cannot be in a DECLARE TARGET directive + !ERROR: The module name cannot be in a DECLARE TARGET directive !$omp declare target (mod1) - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] - !ERROR: The module name or main program name cannot be in a DECLARE TARGET directive + ! This is now allowed: "main" is an implicitly declared symbol, separate + ! from the main program symbol. !$omp declare target (main) end diff --git a/flang/test/Semantics/OpenMP/do-schedule03.f90 b/flang/test/Semantics/OpenMP/do-schedule03.f90 index 8787b094d581a..05602ca57e4a9 100644 --- a/flang/test/Semantics/OpenMP/do-schedule03.f90 +++ b/flang/test/Semantics/OpenMP/do-schedule03.f90 @@ -2,27 +2,27 @@ !
OpenMP Version 4.5 ! 2.7.1 Schedule Clause ! Test that does not catch non constant integer expressions like xx - xx. - !DEF: /ompdoschedule MainProgram -program ompdoschedule - !DEF: /ompdoschedule/a ObjectEntity REAL(4) - !DEF: /ompdoschedule/y ObjectEntity REAL(4) - !DEF: /ompdoschedule/z ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE MainProgram +program OMPDOSCHEDULE + !DEF: /OMPDOSCHEDULE/a ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE/y ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE/z ObjectEntity REAL(4) real a(100),y(100),z(100) - !DEF: /ompdoschedule/b ObjectEntity INTEGER(4) - !DEF: /ompdoschedule/i ObjectEntity INTEGER(4) - !DEF: /ompdoschedule/n ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/b ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/i ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/n ObjectEntity INTEGER(4) integer b,i,n - !REF: /ompdoschedule/b + !REF: /OMPDOSCHEDULE/b b = 10 !$omp do schedule(static,b-b) - !DEF: /ompdoschedule/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) - !REF: /ompdoschedule/n + !DEF: /OMPDOSCHEDULE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !REF: /OMPDOSCHEDULE/n do i = 2,n+1 - !REF: /ompdoschedule/y - !REF: /ompdoschedule/OtherConstruct1/i - !REF: /ompdoschedule/z - !REF: /ompdoschedule/a + !REF: /OMPDOSCHEDULE/y + !REF: /OMPDOSCHEDULE/OtherConstruct1/i + !REF: /OMPDOSCHEDULE/z + !REF: /OMPDOSCHEDULE/a y(i) = z(i-1) + a(i) end do !$omp end do -end program ompdoschedule +end program OMPDOSCHEDULE diff --git a/flang/test/Semantics/OpenMP/do01-positivecase.f90 b/flang/test/Semantics/OpenMP/do01-positivecase.f90 index 905fdbaf18476..50a6870f43896 100644 --- a/flang/test/Semantics/OpenMP/do01-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do01-positivecase.f90 @@ -4,16 +4,16 @@ ! The loop iteration variable may not appear in a firstprivate directive. ! A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) integer i !$omp do firstprivate(k) - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 print *, "Hello" end do !$omp end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do04-positivecase.f90 b/flang/test/Semantics/OpenMP/do04-positivecase.f90 index eb2d67bb8ceb2..51b69fce3c7cc 100644 --- a/flang/test/Semantics/OpenMP/do04-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do04-positivecase.f90 @@ -2,21 +2,21 @@ ! OpenMP Version 4.5 ! 
2.7.1 Do Loop Constructs -!DEF: /omp_do1 MainProgram -program omp_do1 - !DEF: /omp_do1/i ObjectEntity INTEGER(4) - !DEF: /omp_do1/j ObjectEntity INTEGER(4) - !DEF: /omp_do1/k (OmpThreadprivate) ObjectEntity INTEGER(4) - !DEF: /omp_do1/n (OmpThreadprivate) ObjectEntity INTEGER(4) +!DEF: /OMP_DO1 MainProgram +program OMP_DO1 + !DEF: /OMP_DO1/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/k (OmpThreadprivate) ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/n (OmpThreadprivate) ObjectEntity INTEGER(4) integer i, j, k, n !$omp threadprivate (k,n) !$omp do - !DEF: /omp_do1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !REF: /omp_do1/j + !REF: /OMP_DO1/j do j=1,10 print *, "Hello" end do end do !$omp end do -end program omp_do1 +end program OMP_DO1 diff --git a/flang/test/Semantics/OpenMP/do05-positivecase.f90 b/flang/test/Semantics/OpenMP/do05-positivecase.f90 index eda04610535c2..d4eb1fd6bc3da 100644 --- a/flang/test/Semantics/OpenMP/do05-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do05-positivecase.f90 @@ -3,13 +3,13 @@ ! 2.7.1 Loop Construct restrictions on single directive. ! A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/n ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/n ObjectEntity INTEGER(4) integer i,n !$omp parallel - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 !$omp single print *, "hello" @@ -19,13 +19,13 @@ program omp_do !$omp parallel default(shared) !$omp do - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/n HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/n HostAssoc INTEGER(4) do i=1,n !$omp parallel !$omp single !DEF: /work EXTERNAL (Subroutine) ProcEntity - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4) call work(i, 1) !$omp end single !$omp end parallel @@ -34,7 +34,7 @@ program omp_do !$omp end parallel !$omp parallel private(i) - !DEF: /omp_do/OtherConstruct3/i (OmpPrivate, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct3/i (OmpPrivate, OmpExplicit) HostAssoc INTEGER(4) do i=1,10 !$omp single print *, "hello" @@ -43,32 +43,32 @@ program omp_do !$omp end parallel !$omp target teams distribute parallel do - !DEF:/omp_do/OtherConstruct4/i (OmpPrivate ,OmpPreDetermined) HostAssoc INTEGER(4) + !DEF:/OMP_DO/OtherConstruct4/i (OmpPrivate ,OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF:/omp_do/OtherConstruct4/i + !REF:/OMP_DO/OtherConstruct4/i if(i<10) cycle end do !$omp end target teams distribute parallel do !$omp target teams distribute parallel do simd - !DEF:/omp_do/OtherConstruct5/i (OmpLinear,OmpPreDetermined) HostAssoc INTEGER(4) + !DEF:/OMP_DO/OtherConstruct5/i (OmpLinear,OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF:/omp_do/OtherConstruct5/i + !REF:/OMP_DO/OtherConstruct5/i if(i<10) cycle end do !$omp end target teams 
distribute parallel do simd !$omp target teams distribute - !DEF: /omp_do/OtherConstruct6/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct6/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF: /omp_do/OtherConstruct6/i + !REF: /OMP_DO/OtherConstruct6/i if(i < 5) cycle end do !$omp target teams distribute simd - !DEF: /omp_do/OtherConstruct7/i (OmpLinear, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct7/i (OmpLinear, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF: /omp_do/OtherConstruct7/i + !REF: /OMP_DO/OtherConstruct7/i if(i < 5) cycle end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do06-positivecases.f90 b/flang/test/Semantics/OpenMP/do06-positivecases.f90 index 2713b55fa2ecb..dfb1d999bbc53 100644 --- a/flang/test/Semantics/OpenMP/do06-positivecases.f90 +++ b/flang/test/Semantics/OpenMP/do06-positivecases.f90 @@ -5,14 +5,14 @@ ! region ever binds to a loop region arising from the loop construct. ! A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/j ObjectEntity INTEGER(4) - !DEF: /omp_do/k ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO/k ObjectEntity INTEGER(4) integer i, j, k !$omp do ordered - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 !$omp ordered !DEF: /my_func EXTERNAL (Subroutine) ProcEntity @@ -20,4 +20,4 @@ program omp_do !$omp end ordered end do !$omp end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do11.f90 b/flang/test/Semantics/OpenMP/do11.f90 index faab457efff3c..472048d684276 100644 --- a/flang/test/Semantics/OpenMP/do11.f90 +++ b/flang/test/Semantics/OpenMP/do11.f90 @@ -2,24 +2,24 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop Constructs -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/j ObjectEntity INTEGER(4) - !DEF: /omp_do/k ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO/k ObjectEntity INTEGER(4) integer i, j, k !$omp do - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !REF: /omp_do/j + !REF: /OMP_DO/j do j=1,10 - !REF: /omp_do/OtherConstruct1/i - !REF: /omp_do/j + !REF: /OMP_DO/OtherConstruct1/i + !REF: /OMP_DO/j print *, "it", i, j end do end do !$omp end do -end program omp_do +end program OMP_DO !DEF: /omp_do2 (Subroutine)Subprogram subroutine omp_do2 diff --git a/flang/test/Semantics/OpenMP/do12.f90 b/flang/test/Semantics/OpenMP/do12.f90 index a057a246f7a99..06055b7572a60 100644 --- a/flang/test/Semantics/OpenMP/do12.f90 +++ b/flang/test/Semantics/OpenMP/do12.f90 @@ -2,20 +2,20 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. 
-!DEF: /omp_cycle MainProgram -program omp_cycle +!DEF: /OMP_CYCLE MainProgram +program OMP_CYCLE !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/OtherConstruct1/i + !REF: /OMP_CYCLE/OtherConstruct1/i if (i<1) cycle - !DEF: /omp_cycle/j (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/j (Implicit) ObjectEntity INTEGER(4) do j=0,10 - !DEF: /omp_cycle/k (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/k (Implicit) ObjectEntity INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct1/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct1/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -23,17 +23,17 @@ program omp_cycle !$omp end do !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/j + !REF: /OMP_CYCLE/j do j=0,10 - !REF: /omp_cycle/OtherConstruct2/i + !REF: /OMP_CYCLE/OtherConstruct2/i if (i<1) cycle - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct2/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct2/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -41,17 +41,17 @@ program omp_cycle !$omp end do !$omp do collapse(2) - !DEF: /omp_cycle/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/i if (i<1) cycle - !REF: /omp_cycle/OtherConstruct3/i - !REF: /omp_cycle/OtherConstruct3/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -59,17 +59,17 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !DEF: /omp_cycle/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/i if (i<1) cycle - !REF: /omp_cycle/OtherConstruct4/i - !REF: /omp_cycle/OtherConstruct4/j - !REF: /omp_cycle/OtherConstruct4/k + !REF: /OMP_CYCLE/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/j + !REF: /OMP_CYCLE/OtherConstruct4/k print *, i, j, k end do end do @@ -77,20 +77,20 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo:do i=0,10 - !DEF: 
/omp_cycle/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1:do j=0,10 - !DEF: /omp_cycle/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo2:do k=0,10 - !REF: /omp_cycle/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/i if (i<1) cycle foo2 - !REF: /omp_cycle/OtherConstruct5/i - !REF: /omp_cycle/OtherConstruct5/j - !REF: /omp_cycle/OtherConstruct5/k + !REF: /OMP_CYCLE/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/j + !REF: /OMP_CYCLE/OtherConstruct5/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program omp_cycle +end program OMP_CYCLE diff --git a/flang/test/Semantics/OpenMP/do14.f90 b/flang/test/Semantics/OpenMP/do14.f90 index 5e8a5a64c2979..e17647394fff7 100644 --- a/flang/test/Semantics/OpenMP/do14.f90 +++ b/flang/test/Semantics/OpenMP/do14.f90 @@ -2,19 +2,19 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. -!DEF: /omp_cycle MainProgram -program omp_cycle +!DEF: /OMP_CYCLE MainProgram +program OMP_CYCLE !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 cycle - !DEF: /omp_cycle/j (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/j (Implicit) ObjectEntity INTEGER(4) do j=0,10 - !DEF: /omp_cycle/k (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/k (Implicit) ObjectEntity INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct1/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct1/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -22,16 +22,16 @@ program omp_cycle !$omp end do !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/j + !REF: /OMP_CYCLE/j do j=0,10 cycle - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct2/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct2/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -39,16 +39,16 @@ program omp_cycle !$omp end do !$omp do collapse(2) - !DEF: /omp_cycle/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 cycle - !REF: /omp_cycle/OtherConstruct3/i - !REF: /omp_cycle/OtherConstruct3/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -56,16 +56,16 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc 
INTEGER(4) do j=0,10 - !DEF: /omp_cycle/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do k=0,10 cycle - !REF: /omp_cycle/OtherConstruct4/i - !REF: /omp_cycle/OtherConstruct4/j - !REF: /omp_cycle/OtherConstruct4/k + !REF: /OMP_CYCLE/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/j + !REF: /OMP_CYCLE/OtherConstruct4/k print *, i, j, k end do end do @@ -73,19 +73,19 @@ program omp_cycle !$omp end do !$omp do ordered(3) - !DEF: /omp_cycle/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo:do i=0,10 - !DEF: /omp_cycle/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1:do j=0,10 - !DEF: /omp_cycle/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo2:do k=0,10 cycle foo2 - !REF: /omp_cycle/OtherConstruct5/i - !REF: /omp_cycle/OtherConstruct5/j - !REF: /omp_cycle/OtherConstruct5/k + !REF: /OMP_CYCLE/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/j + !REF: /OMP_CYCLE/OtherConstruct5/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program omp_cycle +end program OMP_CYCLE diff --git a/flang/test/Semantics/OpenMP/do17.f90 b/flang/test/Semantics/OpenMP/do17.f90 index c0c59f16dee1b..cac11f215f074 100644 --- a/flang/test/Semantics/OpenMP/do17.f90 +++ b/flang/test/Semantics/OpenMP/do17.f90 @@ -2,56 +2,56 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. -!DEF: /test MainProgram -program test - !DEF: /test/i ObjectEntity INTEGER(4) - !DEF: /test/j ObjectEntity INTEGER(4) - !DEF: /test/k ObjectEntity INTEGER(4) +!DEF: /TEST MainProgram +program TEST + !DEF: /TEST/i ObjectEntity INTEGER(4) + !DEF: /TEST/j ObjectEntity INTEGER(4) + !DEF: /TEST/k ObjectEntity INTEGER(4) integer i, j, k !$omp do collapse(2) - !DEF: /test/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo: do i=0,10 - !DEF: /test/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1: do j=0,10 - !REF: /test/k + !REF: /TEST/k foo2: do k=0,10 - !REF: /test/OtherConstruct1/i + !REF: /TEST/OtherConstruct1/i select case (i) case (5) cycle foo1 case (7) cycle foo2 end select - !REF: /test/OtherConstruct1/i - !REF: /test/OtherConstruct1/j - !REF: /test/k + !REF: /TEST/OtherConstruct1/i + !REF: /TEST/OtherConstruct1/j + !REF: /TEST/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp do collapse(2) - !DEF: /test/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo: do i=0,10 - !DEF: /test/OtherConstruct2/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct2/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1: do j=0,10 - !REF: /test/k + !REF: /TEST/k foo2: do k=0,10 - !REF: /test/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/i if (i<3) then cycle foo1 - !REF: /test/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/i else if (i>8) then cycle foo1 else cycle foo2 end if - !REF: /test/OtherConstruct2/i - !REF: 
/test/OtherConstruct2/j - !REF: /test/k + !REF: /TEST/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/j + !REF: /TEST/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program test +end program TEST diff --git a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 b/flang/test/Semantics/OpenMP/map-clause-symbols.f90 index 8f984fcd2fa7e..1d6315b4a2312 100644 --- a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 +++ b/flang/test/Semantics/OpenMP/map-clause-symbols.f90 @@ -1,6 +1,6 @@ ! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN integer, parameter :: n = 256 real(8) :: a(256) !$omp target map(mapper(xx), from:a) diff --git a/flang/test/Semantics/OpenMP/reduction08.f90 b/flang/test/Semantics/OpenMP/reduction08.f90 index 01a06eb7d7414..b4a81e644c1e7 100644 --- a/flang/test/Semantics/OpenMP/reduction08.f90 +++ b/flang/test/Semantics/OpenMP/reduction08.f90 @@ -2,62 +2,62 @@ ! OpenMP Version 4.5 ! 2.15.3.6 Reduction Clause Positive cases -!DEF: /omp_reduction MainProgram -program omp_reduction - !DEF: /omp_reduction/i ObjectEntity INTEGER(4) +!DEF: /OMP_REDUCTION MainProgram +program OMP_REDUCTION + !DEF: /OMP_REDUCTION/i ObjectEntity INTEGER(4) integer i - !DEF: /omp_reduction/k ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/k ObjectEntity INTEGER(4) integer :: k = 10 - !DEF: /omp_reduction/m ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/m ObjectEntity INTEGER(4) integer :: m = 12 !$omp parallel do reduction(max:k) - !DEF: /omp_reduction/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct1/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct1/m (OmpShared) HostAssoc INTEGER(4) k = max(k, m) end do !$omp end parallel do !$omp parallel do reduction(min:k) - !DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct2/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct2/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct2/m (OmpShared) HostAssoc INTEGER(4) k = min(k, m) end do !$omp end parallel do !$omp parallel do reduction(iand:k) - !DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct3/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct3/m (OmpShared) HostAssoc INTEGER(4) + !DEF: 
/OMP_REDUCTION/OtherConstruct3/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct3/m (OmpShared) HostAssoc INTEGER(4) k = iand(k, m) end do !$omp end parallel do !$omp parallel do reduction(ior:k) - !DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct4/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/ior ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct4/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/ior ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct4/m (OmpShared) HostAssoc INTEGER(4) k = ior(k, m) end do !$omp end parallel do !$omp parallel do reduction(ieor:k) - !DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct5/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct5/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct5/m (OmpShared) HostAssoc INTEGER(4) k = ieor(k,m) end do !$omp end parallel do -end program omp_reduction +end program OMP_REDUCTION diff --git a/flang/test/Semantics/OpenMP/reduction09.f90 b/flang/test/Semantics/OpenMP/reduction09.f90 index d6c71c30d2834..ca60805e8c416 100644 --- a/flang/test/Semantics/OpenMP/reduction09.f90 +++ b/flang/test/Semantics/OpenMP/reduction09.f90 @@ -1,22 +1,22 @@ ! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp ! OpenMP Version 4.5 ! 2.15.3.6 Reduction Clause Positive cases. 
-!DEF: /omp_reduction MainProgram -program omp_reduction - !DEF: /omp_reduction/i ObjectEntity INTEGER(4) +!DEF: /OMP_REDUCTION MainProgram +program OMP_REDUCTION + !DEF: /OMP_REDUCTION/i ObjectEntity INTEGER(4) integer i - !DEF: /omp_reduction/k ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/k ObjectEntity INTEGER(4) integer :: k = 10 - !DEF: /omp_reduction/a ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/a ObjectEntity INTEGER(4) integer a(10) - !DEF: /omp_reduction/b ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/b ObjectEntity INTEGER(4) integer b(10,10,10) !$omp parallel shared(k) !$omp do reduction(+:k) - !DEF: /omp_reduction/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct1/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do @@ -24,53 +24,53 @@ program omp_reduction !$omp parallel do reduction(+:a(10)) - !DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct2/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel do reduction(+:a(1:10:1)) - !DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct3/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel do reduction(+:b(1:10:1,1:5,2)) - !DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct4/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel do reduction(+:b(1:10:1,1:5,2:5:1)) - !DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct5/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel private(i) !$omp do reduction(+:k) reduction(+:j) - !DEF: /omp_reduction/OtherConstruct6/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct6/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct6/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct6/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do !$omp end parallel !$omp do reduction(+:k) reduction(*:j) reduction(+:l) - !DEF: /omp_reduction/OtherConstruct7/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: 
/OMP_REDUCTION/OtherConstruct7/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct7/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct7/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do -end program omp_reduction +end program OMP_REDUCTION diff --git a/flang/test/Semantics/OpenMP/reduction11.f90 b/flang/test/Semantics/OpenMP/reduction11.f90 index b2ad0f6a6ee11..dfb3986d37d78 100644 --- a/flang/test/Semantics/OpenMP/reduction11.f90 +++ b/flang/test/Semantics/OpenMP/reduction11.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols -o - %s 2>&1 | FileCheck %s ! Check intrinsic reduction symbols (in this case "max" are marked as INTRINSIC -! CHECK: MainProgram scope: omp_reduction +! CHECK: MainProgram scope: OMP_REDUCTION program omp_reduction ! CHECK: i size=4 offset=0: ObjectEntity type: INTEGER(4) integer i diff --git a/flang/test/Semantics/OpenMP/scan2.f90 b/flang/test/Semantics/OpenMP/scan2.f90 index ffe84910f88a2..1ae5e871595c4 100644 --- a/flang/test/Semantics/OpenMP/scan2.f90 +++ b/flang/test/Semantics/OpenMP/scan2.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols -o - %s 2>&1 | FileCheck %s ! Check scan reduction -! CHECK: MainProgram scope: omp_reduction +! CHECK: MainProgram scope: OMP_REDUCTION program omp_reduction ! CHECK: i size=4 offset=0: ObjectEntity type: INTEGER(4) integer i diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90 index fbd9a0286c79b..74fb420cc517e 100644 --- a/flang/test/Semantics/OpenMP/symbol01.f90 +++ b/flang/test/Semantics/OpenMP/symbol01.f90 @@ -16,53 +16,53 @@ module md integer :: b end type myty end module md -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM !REF: /md use :: md - !DEF: /mm/c CommonBlockDetails - !DEF: /mm/x (InCommonBlock) ObjectEntity REAL(4) - !DEF: /mm/y (InCommonBlock) ObjectEntity REAL(4) + !DEF: /MM/c CommonBlockDetails + !DEF: /MM/x (InCommonBlock) ObjectEntity REAL(4) + !DEF: /MM/y (InCommonBlock) ObjectEntity REAL(4) common /c/x, y - !REF: /mm/x - !REF: /mm/y + !REF: /MM/x + !REF: /MM/y real x, y - !DEF: /mm/myty Use - !DEF: /mm/t ObjectEntity TYPE(myty) + !DEF: /MM/myty Use + !DEF: /MM/t ObjectEntity TYPE(myty) type(myty) :: t - !DEF: /mm/b ObjectEntity INTEGER(4) + !DEF: /MM/b ObjectEntity INTEGER(4) integer b(10) - !REF: /mm/t + !REF: /MM/t !REF: /md/myty/a t%a = 3.14 - !REF: /mm/t + !REF: /MM/t !REF: /md/myty/b t%b = 1 - !REF: /mm/b + !REF: /MM/b b = 2 - !DEF: /mm/a (Implicit) ObjectEntity REAL(4) + !DEF: /MM/a (Implicit) ObjectEntity REAL(4) a = 1.0 - !DEF: /mm/c (Implicit) ObjectEntity REAL(4) + !DEF: /MM/c (Implicit) ObjectEntity REAL(4) c = 2.0 !$omp parallel do private(a,t,/c/) shared(c) - !DEF: /mm/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /MM/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /mm/OtherConstruct1/a (OmpPrivate, OmpExplicit) HostAssoc REAL(4) - !DEF: /mm/OtherConstruct1/b (OmpShared) HostAssoc INTEGER(4) - !REF: /mm/OtherConstruct1/i + !DEF: /MM/OtherConstruct1/a (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/b (OmpShared) HostAssoc INTEGER(4) + !REF: /MM/OtherConstruct1/i a = a+b(i) - !DEF: /mm/OtherConstruct1/t (OmpPrivate, OmpExplicit) HostAssoc TYPE(myty) + !DEF: /MM/OtherConstruct1/t (OmpPrivate, OmpExplicit) HostAssoc TYPE(myty) !REF: /md/myty/a - !REF: /mm/OtherConstruct1/i + !REF: 
/MM/OtherConstruct1/i t%a = i - !DEF: /mm/OtherConstruct1/y (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/y (OmpPrivate, OmpExplicit) HostAssoc REAL(4) y = 0. - !DEF: /mm/OtherConstruct1/x (OmpPrivate, OmpExplicit) HostAssoc REAL(4) - !REF: /mm/OtherConstruct1/a - !REF: /mm/OtherConstruct1/i - !REF: /mm/OtherConstruct1/y + !DEF: /MM/OtherConstruct1/x (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !REF: /MM/OtherConstruct1/a + !REF: /MM/OtherConstruct1/i + !REF: /MM/OtherConstruct1/y x = a+i+y - !DEF: /mm/OtherConstruct1/c (OmpShared, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/c (OmpShared, OmpExplicit) HostAssoc REAL(4) c = 3.0 end do end program diff --git a/flang/test/Semantics/OpenMP/symbol05.f90 b/flang/test/Semantics/OpenMP/symbol05.f90 index fe01f15d20aa3..4f3d1926013dc 100644 --- a/flang/test/Semantics/OpenMP/symbol05.f90 +++ b/flang/test/Semantics/OpenMP/symbol05.f90 @@ -31,10 +31,10 @@ subroutine foo end block end subroutine foo end module mm -!DEF: /tt MainProgram -program tt +!DEF: /TT MainProgram +program TT !REF: /mm use :: mm - !DEF: /tt/foo (Subroutine) Use + !DEF: /TT/foo (Subroutine) Use call foo -end program tt +end program TT diff --git a/flang/test/Semantics/OpenMP/symbol07.f90 b/flang/test/Semantics/OpenMP/symbol07.f90 index 86b7305411347..1b0c25b7a04b0 100644 --- a/flang/test/Semantics/OpenMP/symbol07.f90 +++ b/flang/test/Semantics/OpenMP/symbol07.f90 @@ -30,8 +30,8 @@ subroutine function_call_in_region !REF: /function_call_in_region/b print *, a, b end subroutine function_call_in_region -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM !REF: /function_call_in_region call function_call_in_region -end program mm +end program MM diff --git a/flang/test/Semantics/OpenMP/symbol09.f90 b/flang/test/Semantics/OpenMP/symbol09.f90 index 86b7305411347..1b0c25b7a04b0 100644 --- a/flang/test/Semantics/OpenMP/symbol09.f90 +++ b/flang/test/Semantics/OpenMP/symbol09.f90 @@ -30,8 +30,8 @@ subroutine function_call_in_region !REF: /function_call_in_region/b print *, a, b end subroutine function_call_in_region -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM !REF: /function_call_in_region call function_call_in_region -end program mm +end program MM diff --git a/flang/test/Semantics/OpenMP/threadprivate03.f90 b/flang/test/Semantics/OpenMP/threadprivate03.f90 index 81e26ee327a9d..fda2fe608ac3c 100644 --- a/flang/test/Semantics/OpenMP/threadprivate03.f90 +++ b/flang/test/Semantics/OpenMP/threadprivate03.f90 @@ -10,11 +10,11 @@ program main use mod1 integer, parameter :: i = 1 - !ERROR: The module name or main program name cannot be in a THREADPRIVATE directive + !ERROR: The module name cannot be in a THREADPRIVATE directive !$omp threadprivate(mod1) - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] - !ERROR: The module name or main program name cannot be in a THREADPRIVATE directive + ! This is now allowed, since "main" is an implicitly declared symbol, + ! separate from the main program symbol. !$omp threadprivate(main) !ERROR: The entity with PARAMETER attribute cannot be in a THREADPRIVATE directive diff --git a/flang/test/Semantics/bind-c18.f90 b/flang/test/Semantics/bind-c18.f90 new file mode 100644 index 0000000000000..f61111458c6d9 --- /dev/null +++ b/flang/test/Semantics/bind-c18.f90 @@ -0,0 +1,7 @@ +!
RUN: %python %S/test_errors.py %s %flang_fc1 +bind(c) :: /blk/ +!ERROR: 'x' may not be a member of BIND(C) COMMON block /blk/ +common /blk/ x +!BECAUSE: A scalar interoperable variable may not be ALLOCATABLE or POINTER +integer, pointer :: x +end diff --git a/flang/test/Semantics/bug148559.f90 b/flang/test/Semantics/bug148559.f90 new file mode 100644 index 0000000000000..d7b959ac8f191 --- /dev/null +++ b/flang/test/Semantics/bug148559.f90 @@ -0,0 +1,12 @@ +!RUN: %flang_fc1 -fsyntax-only %s +!Regression test for crash in semantics on Cray pointers + +module m + pointer(ptr,pp) +end module m + +program main + use m, only:renamea=>pp + use m, only:pp + print *, renamea +end diff --git a/flang/test/Semantics/bug148675.f90 b/flang/test/Semantics/bug148675.f90 new file mode 100644 index 0000000000000..5ce117e7bb3df --- /dev/null +++ b/flang/test/Semantics/bug148675.f90 @@ -0,0 +1,21 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +module m + type t + integer n + contains + procedure :: assign1 => myassign, assign2 => myassign + generic :: ASSIGNMENT(=) => assign1 + generic :: ASSIGNMENT(=) => assign2 + end type + contains + subroutine myassign(to, from) + class(t), intent(out) :: to + integer, intent(in) :: from + to%n = from + end + subroutine test + type(t) x + !ERROR: Multiple specific procedures for the generic ASSIGNMENT(=) match operand types TYPE(t) and INTEGER(4) + x = 5 + end +end diff --git a/flang/test/Semantics/getsymbols03-a.f90 b/flang/test/Semantics/getsymbols03-a.f90 index 95b7fb418367d..5c5e87575a9cb 100644 --- a/flang/test/Semantics/getsymbols03-a.f90 +++ b/flang/test/Semantics/getsymbols03-a.f90 @@ -8,7 +8,7 @@ program main end program ! RUN: %flang_fc1 -fget-symbols-sources %s 2>&1 | FileCheck %s +! CHECK:MAIN:{{.*}}getsymbols03-a.f90, 4, 9-13 ! CHECK:f:{{.*}}getsymbols03-b.f90, 2, 12-13 -! CHECK:main:{{.*}}getsymbols03-a.f90, 4, 9-13 ! CHECK:mm3:{{.*}}getsymbols03-a.f90, 5, 6-9 ! CHECK:x:{{.*}}getsymbols03-a.f90, 6, 13-14 diff --git a/flang/test/Semantics/long-name.f90 b/flang/test/Semantics/long-name.f90 index 44899b13edd5a..d5a795113e204 100644 --- a/flang/test/Semantics/long-name.f90 +++ b/flang/test/Semantics/long-name.f90 @@ -1,6 +1,6 @@ ! RUN: %python %S/test_errors.py %s %flang_fc1 -Werror -pedantic -!PORTABILITY: aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg1 has length 64, which is greater than the maximum name length 63 [-Wlong-names] +!PORTABILITY: AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEFFFFFFFFFFGGG1 has length 64, which is greater than the maximum name length 63 [-Wlong-names] program aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg1 !PORTABILITY: aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg2 has length 64, which is greater than the maximum name length 63 [-Wlong-names] diff --git a/flang/test/Semantics/modproc01.f90 b/flang/test/Semantics/modproc01.f90 index 5f45362e95093..e565ddcfbe0b1 100644 --- a/flang/test/Semantics/modproc01.f90 +++ b/flang/test/Semantics/modproc01.f90 @@ -125,7 +125,7 @@ program test x = mf(3, "abc", pdt1(1,3)()) ! 
call ms(mf) end program -!CHECK: MainProgram scope: test size=88 alignment=8 +!CHECK: MainProgram scope: TEST size=88 alignment=8 !CHECK: mf, MODULE (Function): Use from mf in m !CHECK: pdt1: Use from pdt1 in m !CHECK: pdt2: Use from pdt2 in m diff --git a/flang/test/Semantics/multi-programs04.f90 b/flang/test/Semantics/multi-programs04.f90 index 54b0235aa78f0..e69ac7325278e 100644 --- a/flang/test/Semantics/multi-programs04.f90 +++ b/flang/test/Semantics/multi-programs04.f90 @@ -4,6 +4,6 @@ program m end !ERROR: A source file cannot contain more than one main program -!ERROR: 'm' is already declared in this scoping unit +!ERROR: 'M' is already declared in this scoping unit program m end diff --git a/flang/test/Semantics/pointer01.f90 b/flang/test/Semantics/pointer01.f90 index eaa2426dd77e3..79d6016a6af46 100644 --- a/flang/test/Semantics/pointer01.f90 +++ b/flang/test/Semantics/pointer01.f90 @@ -7,7 +7,6 @@ subroutine msubr end module program main use m - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] pointer main !ERROR: Cannot change POINTER attribute on use-associated 'mobj' pointer mobj diff --git a/flang/test/Semantics/procinterface01.f90 b/flang/test/Semantics/procinterface01.f90 index 73040b0987bd0..70f4a889d6809 100644 --- a/flang/test/Semantics/procinterface01.f90 +++ b/flang/test/Semantics/procinterface01.f90 @@ -159,35 +159,35 @@ end function logical tan = "?" end function tan -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /module1 use :: module1 - !DEF: /main/derived1 Use - !DEF: /main/instance ObjectEntity TYPE(derived1) + !DEF: /MAIN/derived1 Use + !DEF: /MAIN/instance ObjectEntity TYPE(derived1) type(derived1) :: instance - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p1 if (instance%p1(1.)/=2.) print *, "p1 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p2 if (instance%p2(1.)/=2.) print *, "p2 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p3 if (.not.instance%p3(1.)) print *, "p3 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p4 if (.not.instance%p4(1.)) print *, "p4 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p5 if (instance%p5(1.)/=(5.,6.)) print *, "p5 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p6 if (instance%p6(1.)/=2.) print *, "p6 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p7 if (instance%p7(0.)/=1.) print *, "p7 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p8 if (instance%p8(1.)/="a") print *, "p8 failed" -end program main +end program MAIN diff --git a/flang/test/Semantics/resolve05.f90 b/flang/test/Semantics/resolve05.f90 index 0c9877af9b4e2..7b142d2ebd613 100644 --- a/flang/test/Semantics/resolve05.f90 +++ b/flang/test/Semantics/resolve05.f90 @@ -1,6 +1,5 @@ ! 
RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic program p - !PORTABILITY: Name 'p' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] integer :: p end module m diff --git a/flang/test/Semantics/resolve125.f90 b/flang/test/Semantics/resolve125.f90 index e040c006ec179..620c7d65578cd 100644 --- a/flang/test/Semantics/resolve125.f90 +++ b/flang/test/Semantics/resolve125.f90 @@ -43,7 +43,7 @@ subroutine reset end subroutine reset end module m2 -!CHECK: MainProgram scope: main +!CHECK: MainProgram scope: MAIN !CHECK: i: Use from i in m2 !CHECK: i2: Use from i2 in m2 !CHECK: init (Subroutine): Use from init in m2 @@ -61,4 +61,4 @@ program main else print *, "fail" end if -end program main \ No newline at end of file +end program main diff --git a/flang/test/Semantics/symbol03.f90 b/flang/test/Semantics/symbol03.f90 index a6b4b0bd15937..62472495d9736 100644 --- a/flang/test/Semantics/symbol03.f90 +++ b/flang/test/Semantics/symbol03.f90 @@ -1,23 +1,23 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! Test host association in internal subroutine of main program. -!DEF: /main MainProgram -program main - !DEF: /main/x ObjectEntity INTEGER(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/x ObjectEntity INTEGER(4) integer x - !DEF: /main/s (Subroutine) Subprogram + !DEF: /MAIN/s (Subroutine) Subprogram call s contains - !REF: /main/s + !REF: /MAIN/s subroutine s - !DEF: /main/s/y (Implicit) ObjectEntity REAL(4) - !DEF: /main/s/x HostAssoc INTEGER(4) + !DEF: /MAIN/s/y (Implicit) ObjectEntity REAL(4) + !DEF: /MAIN/s/x HostAssoc INTEGER(4) y = x contains - !DEF: /main/s/s2 (Subroutine) Subprogram + !DEF: /MAIN/s/s2 (Subroutine) Subprogram subroutine s2 - !DEF: /main/s/s2/z (Implicit) ObjectEntity REAL(4) - !DEF: /main/s/s2/x HostAssoc INTEGER(4) + !DEF: /MAIN/s/s2/z (Implicit) ObjectEntity REAL(4) + !DEF: /MAIN/s/s2/x HostAssoc INTEGER(4) z = x end subroutine end subroutine diff --git a/flang/test/Semantics/symbol06.f90 b/flang/test/Semantics/symbol06.f90 index bbd6d4d071c89..b45edabcd5318 100644 --- a/flang/test/Semantics/symbol06.f90 +++ b/flang/test/Semantics/symbol06.f90 @@ -1,56 +1,56 @@ ! 
RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main - !DEF: /main/t1 DerivedType +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/t1 DerivedType type :: t1 - !DEF: /main/t1/a1 ObjectEntity INTEGER(4) + !DEF: /MAIN/t1/a1 ObjectEntity INTEGER(4) integer :: a1 end type - !REF: /main/t1 - !DEF: /main/t2 DerivedType + !REF: /MAIN/t1 + !DEF: /MAIN/t2 DerivedType type, extends(t1) :: t2 - !DEF: /main/t2/a2 ObjectEntity INTEGER(4) + !DEF: /MAIN/t2/a2 ObjectEntity INTEGER(4) integer :: a2 end type - !REF: /main/t2 - !DEF: /main/t3 DerivedType + !REF: /MAIN/t2 + !DEF: /MAIN/t3 DerivedType type, extends(t2) :: t3 - !DEF: /main/t3/a3 ObjectEntity INTEGER(4) + !DEF: /MAIN/t3/a3 ObjectEntity INTEGER(4) integer :: a3 end type - !REF: /main/t3 - !DEF: /main/x3 ObjectEntity TYPE(t3) + !REF: /MAIN/t3 + !DEF: /MAIN/x3 ObjectEntity TYPE(t3) type(t3) :: x3 - !DEF: /main/i ObjectEntity INTEGER(4) + !DEF: /MAIN/i ObjectEntity INTEGER(4) integer i - !REF: /main/i - !REF: /main/x3 - !REF: /main/t2/a2 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t2/a2 i = x3%a2 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t1/a1 i = x3%a1 - !REF: /main/i - !REF: /main/x3 - !DEF: /main/t3/t2 (ParentComp) ObjectEntity TYPE(t2) - !REF: /main/t2/a2 + !REF: /MAIN/i + !REF: /MAIN/x3 + !DEF: /MAIN/t3/t2 (ParentComp) ObjectEntity TYPE(t2) + !REF: /MAIN/t2/a2 i = x3%t2%a2 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t3/t2 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t3/t2 + !REF: /MAIN/t1/a1 i = x3%t2%a1 - !REF: /main/i - !REF: /main/x3 - !DEF: /main/t2/t1 (ParentComp) ObjectEntity TYPE(t1) - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !DEF: /MAIN/t2/t1 (ParentComp) ObjectEntity TYPE(t1) + !REF: /MAIN/t1/a1 i = x3%t1%a1 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t3/t2 - !REF: /main/t2/t1 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t3/t2 + !REF: /MAIN/t2/t1 + !REF: /MAIN/t1/a1 i = x3%t2%t1%a1 end program diff --git a/flang/test/Semantics/symbol07.f90 b/flang/test/Semantics/symbol07.f90 index f3cc934e51b16..e1d8257b9e190 100644 --- a/flang/test/Semantics/symbol07.f90 +++ b/flang/test/Semantics/symbol07.f90 @@ -1,40 +1,40 @@ ! 
RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN implicit complex(z) - !DEF: /main/t DerivedType + !DEF: /MAIN/t DerivedType type :: t - !DEF: /main/t/re ObjectEntity REAL(4) + !DEF: /MAIN/t/re ObjectEntity REAL(4) real :: re - !DEF: /main/t/im ObjectEntity REAL(4) + !DEF: /MAIN/t/im ObjectEntity REAL(4) real :: im end type - !DEF: /main/z1 ObjectEntity COMPLEX(4) + !DEF: /MAIN/z1 ObjectEntity COMPLEX(4) complex z1 - !REF: /main/t - !DEF: /main/w ObjectEntity TYPE(t) + !REF: /MAIN/t + !DEF: /MAIN/w ObjectEntity TYPE(t) type(t) :: w - !DEF: /main/x ObjectEntity REAL(4) - !DEF: /main/y ObjectEntity REAL(4) + !DEF: /MAIN/x ObjectEntity REAL(4) + !DEF: /MAIN/y ObjectEntity REAL(4) real x, y - !REF: /main/x - !REF: /main/z1 + !REF: /MAIN/x + !REF: /MAIN/z1 x = z1%re - !REF: /main/y - !REF: /main/z1 + !REF: /MAIN/y + !REF: /MAIN/z1 y = z1%im - !DEF: /main/z2 (Implicit) ObjectEntity COMPLEX(4) - !REF: /main/x + !DEF: /MAIN/z2 (Implicit) ObjectEntity COMPLEX(4) + !REF: /MAIN/x z2%re = x - !REF: /main/z2 - !REF: /main/y + !REF: /MAIN/z2 + !REF: /MAIN/y z2%im = y - !REF: /main/x - !REF: /main/w - !REF: /main/t/re + !REF: /MAIN/x + !REF: /MAIN/w + !REF: /MAIN/t/re x = w%re - !REF: /main/y - !REF: /main/w - !REF: /main/t/im + !REF: /MAIN/y + !REF: /MAIN/w + !REF: /MAIN/t/im y = w%im end program diff --git a/flang/test/Semantics/symbol08.f90 b/flang/test/Semantics/symbol08.f90 index 61dab798955c5..933ff6d0c2ba8 100644 --- a/flang/test/Semantics/symbol08.f90 +++ b/flang/test/Semantics/symbol08.f90 @@ -1,15 +1,15 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main - !DEF: /main/x POINTER ObjectEntity REAL(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/x POINTER ObjectEntity REAL(4) pointer :: x - !REF: /main/x + !REF: /MAIN/x real x - !DEF: /main/y EXTERNAL, POINTER (Function) ProcEntity REAL(4) + !DEF: /MAIN/y EXTERNAL, POINTER (Function) ProcEntity REAL(4) pointer :: y - !REF: /main/y + !REF: /MAIN/y procedure (real) :: y - !DEF: /main/z (Implicit) ObjectEntity REAL(4) - !REF: /main/y + !DEF: /MAIN/z (Implicit) ObjectEntity REAL(4) + !REF: /MAIN/y z = y() end program diff --git a/flang/test/Semantics/symbol15.f90 b/flang/test/Semantics/symbol15.f90 index df10942e6af2d..79a45491306ef 100644 --- a/flang/test/Semantics/symbol15.f90 +++ b/flang/test/Semantics/symbol15.f90 @@ -249,15 +249,15 @@ subroutine ext2 !DEF: /ext3 (Subroutine) Subprogram subroutine ext3 end subroutine -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /m use :: m - !DEF: /main/pdt1 Use - !DEF: /main/pdt1y ObjectEntity TYPE(pdt1(k=2_4)) + !DEF: /MAIN/pdt1 Use + !DEF: /MAIN/pdt1y ObjectEntity TYPE(pdt1(k=2_4)) type(pdt1(2)) :: pdt1y - !DEF: /main/pdt2 Use - !DEF: /main/pdt2y ObjectEntity TYPE(pdt2(k=2_4)) + !DEF: /MAIN/pdt2 Use + !DEF: /MAIN/pdt2y ObjectEntity TYPE(pdt2(k=2_4)) type(pdt2(2)) :: pdt2y print *, "compiled" end program diff --git a/flang/test/Semantics/symbol16.f90 b/flang/test/Semantics/symbol16.f90 index 7a46092c36b53..547c4624d4cdb 100644 --- a/flang/test/Semantics/symbol16.f90 +++ b/flang/test/Semantics/symbol16.f90 @@ -1,18 +1,18 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! 
Statement functions -!DEF: /p1 MainProgram -program p1 - !DEF: /p1/f (Function, StmtFunction) Subprogram INTEGER(4) - !DEF: /p1/i ObjectEntity INTEGER(4) - !DEF: /p1/j ObjectEntity INTEGER(4) +!DEF: /P1 MainProgram +program P1 + !DEF: /P1/f (Function, StmtFunction) Subprogram INTEGER(4) + !DEF: /P1/i ObjectEntity INTEGER(4) + !DEF: /P1/j ObjectEntity INTEGER(4) integer f, i, j - !REF: /p1/f - !REF: /p1/i - !DEF: /p1/f/i ObjectEntity INTEGER(4) + !REF: /P1/f + !REF: /P1/i + !DEF: /P1/f/i ObjectEntity INTEGER(4) f(i) = i + 1 - !REF: /p1/j - !REF: /p1/f + !REF: /P1/j + !REF: /P1/f j = f(2) end program diff --git a/flang/test/Semantics/symbol17.f90 b/flang/test/Semantics/symbol17.f90 index 434f124509a32..a0d916e55cfa4 100644 --- a/flang/test/Semantics/symbol17.f90 +++ b/flang/test/Semantics/symbol17.f90 @@ -1,44 +1,44 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! Forward references to derived types (non-error cases) -!DEF: /main MainProgram -program main - !DEF: /main/t1 DerivedType +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/t1 DerivedType type :: t1 - !DEF: /main/t2 DerivedType - !DEF: /main/t1/t1a ALLOCATABLE ObjectEntity TYPE(t2) + !DEF: /MAIN/t2 DerivedType + !DEF: /MAIN/t1/t1a ALLOCATABLE ObjectEntity TYPE(t2) type(t2), allocatable :: t1a - !REF: /main/t2 - !DEF: /main/t1/t1p POINTER ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t1/t1p POINTER ObjectEntity TYPE(t2) type(t2), pointer :: t1p end type - !REF: /main/t2 + !REF: /MAIN/t2 type :: t2 - !REF: /main/t2 - !DEF: /main/t2/t2a ALLOCATABLE ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t2/t2a ALLOCATABLE ObjectEntity TYPE(t2) type(t2), allocatable :: t2a - !REF: /main/t2 - !DEF: /main/t2/t2p POINTER ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t2/t2p POINTER ObjectEntity TYPE(t2) type(t2), pointer :: t2p end type - !REF: /main/t1 - !DEF: /main/t1x TARGET ObjectEntity TYPE(t1) + !REF: /MAIN/t1 + !DEF: /MAIN/t1x TARGET ObjectEntity TYPE(t1) type(t1), target :: t1x - !REF: /main/t1x - !REF: /main/t1/t1a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a allocate(t1x%t1a) - !REF: /main/t1x - !REF: /main/t1/t1p - !REF: /main/t1/t1a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1p + !REF: /MAIN/t1/t1a t1x%t1p => t1x%t1a - !REF: /main/t1x - !REF: /main/t1/t1a - !REF: /main/t2/t2a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a + !REF: /MAIN/t2/t2a allocate(t1x%t1a%t2a) - !REF: /main/t1x - !REF: /main/t1/t1a - !REF: /main/t2/t2p - !REF: /main/t2/t2a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a + !REF: /MAIN/t2/t2p + !REF: /MAIN/t2/t2a t1x%t1a%t2p => t1x%t1a%t2a end program !DEF: /f1/fwd DerivedType diff --git a/flang/test/Semantics/symbol18.f90 b/flang/test/Semantics/symbol18.f90 index a37792bce21d7..6e41bb5db91ee 100644 --- a/flang/test/Semantics/symbol18.f90 +++ b/flang/test/Semantics/symbol18.f90 @@ -2,21 +2,21 @@ ! 
Intrinsic function in type declaration statement: type is ignored -!DEF: /p1 MainProgram -program p1 - !DEF: /p1/cos ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity INTEGER(4) +!DEF: /P1 MainProgram +program P1 + !DEF: /P1/cos ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity INTEGER(4) integer cos - !DEF: /p1/y (Implicit) ObjectEntity REAL(4) - !REF: /p1/cos - !DEF: /p1/x (Implicit) ObjectEntity REAL(4) + !DEF: /P1/y (Implicit) ObjectEntity REAL(4) + !REF: /P1/cos + !DEF: /P1/x (Implicit) ObjectEntity REAL(4) y = cos(x) - !REF: /p1/y - !DEF: /p1/sin ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !REF: /p1/x + !REF: /P1/y + !DEF: /P1/sin ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !REF: /P1/x y = sin(x) - !REF: /p1/y + !REF: /P1/y !DEF: /f EXTERNAL (Function, Implicit) ProcEntity REAL(4) - !REF: /p1/x + !REF: /P1/x y = f(x) end program diff --git a/flang/test/Semantics/symbol20.f90 b/flang/test/Semantics/symbol20.f90 index 8c82776933321..bf3aff489b3b9 100644 --- a/flang/test/Semantics/symbol20.f90 +++ b/flang/test/Semantics/symbol20.f90 @@ -32,16 +32,16 @@ subroutine bar print *, "in bar" end subroutine end module -!DEF: /demo MainProgram -program demo +!DEF: /DEMO MainProgram +program DEMO !REF: /m use :: m - !DEF: /demo/bar (Subroutine) Use - !DEF: /demo/p EXTERNAL, POINTER (Subroutine) ProcEntity + !DEF: /DEMO/bar (Subroutine) Use + !DEF: /DEMO/p EXTERNAL, POINTER (Subroutine) ProcEntity procedure(bar), pointer :: p - !REF: /demo/p - !DEF: /demo/foo (Function) Use + !REF: /DEMO/p + !DEF: /DEMO/foo (Function) Use p => foo() - !REF: /demo/p + !REF: /DEMO/p call p end program diff --git a/flang/test/Semantics/symbol25.f90 b/flang/test/Semantics/symbol25.f90 index ac3dd37ef92eb..ac47a19eae8cc 100644 --- a/flang/test/Semantics/symbol25.f90 +++ b/flang/test/Semantics/symbol25.f90 @@ -38,23 +38,23 @@ subroutine inner1 end subroutine inner1 end subroutine outer end module m -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /m use :: m !REF: /m/specific1 call generic - !DEF: /main/inner2 (Subroutine) Subprogram + !DEF: /MAIN/inner2 (Subroutine) Subprogram call inner2 contains - !REF: /main/inner2 + !REF: /MAIN/inner2 subroutine inner2 - !DEF: /main/inner2/generic (Subroutine) Generic + !DEF: /MAIN/inner2/generic (Subroutine) Generic interface generic - !DEF: /main/specific2 (Subroutine) Use + !DEF: /MAIN/specific2 (Subroutine) Use module procedure :: specific2 end interface - !REF: /main/specific2 + !REF: /MAIN/specific2 call generic end subroutine inner2 end program diff --git a/flang/test/Semantics/symbol26.f90 b/flang/test/Semantics/symbol26.f90 index f5e95853ca099..dded4b632c654 100644 --- a/flang/test/Semantics/symbol26.f90 +++ b/flang/test/Semantics/symbol26.f90 @@ -8,16 +8,16 @@ module m !DEF: /m/j PUBLIC (Implicit, InNamelist) ObjectEntity INTEGER(4) namelist/a/j end module m -!DEF: /main MainProgram -program main - !DEF: /main/j (Implicit) ObjectEntity INTEGER(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/j (Implicit) ObjectEntity INTEGER(4) j = 1 contains - !DEF: /main/inner (Subroutine) Subprogram + !DEF: /MAIN/inner (Subroutine) Subprogram subroutine inner !REF: /m use :: m - !DEF: /main/inner/j (Implicit, InNamelist) Use INTEGER(4) + !DEF: /MAIN/inner/j (Implicit, InNamelist) Use INTEGER(4) j = 2 end subroutine end program diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index bb20c546e0261..aeec336ea58ea 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ 
b/flang/test/Semantics/typeinfo01.f90 @@ -87,8 +87,8 @@ subroutine s2(x, y) !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] -!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] +!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] !CHECK: .v.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s2,name=.n.s1)] end module @@ -115,8 +115,8 @@ subroutine s2(x, y) !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, 
SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] -!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] +!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] !CHECK: .v.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s2,name=.n.s1)] end module @@ -133,7 +133,7 @@ impure elemental subroutine s1(x, y) class(t), intent(in) :: y end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -156,7 +156,7 @@ subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=1_1,proc=s4)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,specialcaseflag=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,specialcaseflag=1_1,proc=s4)] end module module m09 @@ -198,7 +198,7 @@ subroutine wu(x,u,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,specialcaseflag=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,specialcaseflag=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,specialcaseflag=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -247,7 +247,7 @@ subroutine wu(x,u,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=wu)] end module module m11 @@ -290,7 +290,7 @@ module m13 contains procedure :: assign1, assign2 generic :: assignment(=) => assign1, assign2 - ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] + ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=assign1)] end type contains impure elemental subroutine assign1(to, from) diff --git a/flang/test/Semantics/typeinfo02.f90 b/flang/test/Semantics/typeinfo02.f90 index 29d14c7a0f196..07293627ab492 100644 --- a/flang/test/Semantics/typeinfo02.f90 +++ b/flang/test/Semantics/typeinfo02.f90 @@ -29,5 +29,5 @@ subroutine wf2(x,u,iot,v,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine end module -!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] -!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=wf1)] +!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=wf2)] diff --git a/flang/test/Semantics/typeinfo09.f90 b/flang/test/Semantics/typeinfo09.f90 index 3527ee6058ad8..8daa6a5f420d7 100644 --- a/flang/test/Semantics/typeinfo09.f90 +++ b/flang/test/Semantics/typeinfo09.f90 @@ -17,4 +17,4 @@ subroutine copy_impl(this, x) end interface end module -!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=copy_impl)] +!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=copy_impl)] diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90 index ad824ad3590a2..facc280815722 100644 --- a/flang/test/Semantics/typeinfo13.f90 +++ b/flang/test/Semantics/typeinfo13.f90 @@ -22,5 +22,5 @@ impure elemental subroutine override(to, from) end end -!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 
init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=override)] +!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,specialcaseflag=0_1,proc=override)] !CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)] diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index 12f63031cbaee..6f24b346e3fb9 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -5,7 +5,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ ! RUN: | FileCheck %s -! CHECK-LABEL: do_concurrent_basic +! CHECK-LABEL: DO_CONCURRENT_BASIC program do_concurrent_basic ! CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) diff --git a/flang/test/Transforms/lower-repack-arrays.fir b/flang/test/Transforms/lower-repack-arrays.fir index 458869cce45fd..9232a74f224d3 100644 --- a/flang/test/Transforms/lower-repack-arrays.fir +++ b/flang/test/Transforms/lower-repack-arrays.fir @@ -28,15 +28,14 @@ func.func @_QPtest1(%arg0: !fir.box> {fir.bindc_name = "x"}) // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.allocmem !fir.array, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_26]] : !fir.box> +// CHECK: fir.result %[[VAL_20]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -93,15 +92,14 @@ func.func @_QPtest1_whole(%arg0: !fir.box> {fir.bindc_name = // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.alloca !fir.array, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.ref>, !fir.shape<2>) -> !fir.ref> -// 
CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) : (!fir.ref>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_26]] : !fir.box> +// CHECK: fir.result %[[VAL_20]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -156,15 +154,14 @@ func.func @_QPtest1_in(%arg0: !fir.box> {fir.bindc_name = "x // CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_17:.*]] = fir.allocmem !fir.array, %[[VAL_14]]#1, %[[VAL_15]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_18:.*]] = fir.declare %[[VAL_17]](%[[VAL_16]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_24:.*]] = fir.shape_shift %[[VAL_14]]#0, %[[VAL_14]]#1, %[[VAL_15]]#0, %[[VAL_15]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_24]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_20:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_20]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_24:.*]] = fir.shift %[[VAL_14]]#0, %[[VAL_15]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_25:.*]] = fir.rebox %[[VAL_19]](%[[VAL_24]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_25]] : !fir.box> +// CHECK: fir.result %[[VAL_19]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -215,10 +212,9 @@ func.func @_QPtest1_out(%arg0: !fir.box> {fir.bindc_name = " // CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_17:.*]] = fir.allocmem !fir.array, %[[VAL_14]]#1, %[[VAL_15]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_18:.*]] = fir.declare %[[VAL_17]](%[[VAL_16]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> -// CHECK: %[[VAL_20:.*]] = fir.shift %[[VAL_14]]#0, %[[VAL_15]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_21:.*]] = fir.rebox 
%[[VAL_19]](%[[VAL_20]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_21]] : !fir.box> +// CHECK: %[[VAL_20:.*]] = fir.shape_shift %[[VAL_14]]#0, %[[VAL_14]]#1, %[[VAL_15]]#0, %[[VAL_15]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_20]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> +// CHECK: fir.result %[[VAL_19]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -286,15 +282,14 @@ func.func @_QPtest2(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.box // CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_24:.*]] = fir.allocmem !fir.array>(%[[VAL_12]] : i32), %[[VAL_21]]#1, %[[VAL_22]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_25:.*]] = fir.declare %[[VAL_24]](%[[VAL_23]]) typeparams %[[VAL_12]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, i32) -> !fir.heap>> -// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) typeparams %[[VAL_12]] : (!fir.heap>>, !fir.shape<2>, i32) -> !fir.box>>> +// CHECK: %[[VAL_31:.*]] = fir.shape_shift %[[VAL_21]]#0, %[[VAL_21]]#1, %[[VAL_22]]#0, %[[VAL_22]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_31]]) typeparams %[[VAL_12]] : (!fir.heap>>, !fir.shapeshift<2>, i32) -> !fir.box>> // CHECK: %[[VAL_27:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_3]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_31:.*]] = fir.shift %[[VAL_21]]#0, %[[VAL_22]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_32:.*]] = fir.rebox %[[VAL_26]](%[[VAL_31]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_32]] : !fir.box>> +// CHECK: fir.result %[[VAL_26]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_1]] : !fir.box>> // CHECK: } @@ -362,15 +357,14 @@ func.func @_QPtest2_stack(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !f // CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_24:.*]] = fir.alloca !fir.array>(%[[VAL_12]] : i32), %[[VAL_21]]#1, %[[VAL_22]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_25:.*]] = fir.declare %[[VAL_24]](%[[VAL_23]]) typeparams %[[VAL_12]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, i32) -> !fir.ref>> -// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) typeparams %[[VAL_12]] : (!fir.ref>>, !fir.shape<2>, i32) -> !fir.box>> +// CHECK: %[[VAL_31:.*]] = fir.shape_shift %[[VAL_21]]#0, %[[VAL_21]]#1, %[[VAL_22]]#0, %[[VAL_22]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_31]]) typeparams %[[VAL_12]] : (!fir.ref>>, !fir.shapeshift<2>, i32) -> !fir.box>> // CHECK: %[[VAL_27:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call 
@_FortranAShallowCopyDirect(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_3]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_31:.*]] = fir.shift %[[VAL_21]]#0, %[[VAL_22]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_32:.*]] = fir.rebox %[[VAL_26]](%[[VAL_31]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_32]] : !fir.box>> +// CHECK: fir.result %[[VAL_26]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_1]] : !fir.box>> // CHECK: } @@ -427,15 +421,14 @@ func.func @_QPtest3(%arg0: !fir.box>> {fir.bindc_n // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.allocmem !fir.array>(%[[VAL_17]] : index), %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_17]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, index) -> !fir.heap>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) typeparams %[[VAL_17]] : (!fir.heap>>, !fir.shape<2>, index) -> !fir.box>>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) typeparams %[[VAL_17]] : (!fir.heap>>, !fir.shapeshift<2>, index) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -493,15 +486,14 @@ func.func @_QPtest3_stack(%arg0: !fir.box>> {fir.b // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.alloca !fir.array>(%[[VAL_17]] : index), %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_17]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, index) -> !fir.ref>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) typeparams %[[VAL_17]] : (!fir.ref>>, !fir.shape<2>, index) -> !fir.box>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) typeparams %[[VAL_17]] : (!fir.ref>>, !fir.shapeshift<2>, index) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call 
@_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -559,15 +551,14 @@ func.func @_QPtest4(%arg0: !fir.box>> {fir.bindc_ // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.allocmem !fir.array>, %[[VAL_16]]#1, %[[VAL_17]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_6]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, index) -> !fir.heap>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.box>>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1, %[[VAL_17]]#0, %[[VAL_17]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) : (!fir.heap>>, !fir.shapeshift<2>) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_16]]#0, %[[VAL_17]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -626,15 +617,14 @@ func.func @_QPtest4_stack(%arg0: !fir.box>> {fir. 
// CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.alloca !fir.array>, %[[VAL_16]]#1, %[[VAL_17]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_6]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, index) -> !fir.ref>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1, %[[VAL_17]]#0, %[[VAL_17]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) : (!fir.ref>>, !fir.shapeshift<2>) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_16]]#0, %[[VAL_17]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -690,15 +680,15 @@ func.func @_QPtest5(%arg0: !fir.box>> {fir.bind // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.allocmem !fir.array>, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>) -> !fir.heap>> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.box>>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) source_box %[[VAL_0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.box>>) -> !fir.class>> +// CHECK: %[[BOX:.*]] = fir.convert %[[VAL_20]] : (!fir.class>>) -> !fir.box>> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[BOX]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_26]] : !fir.box>> +// CHECK: fir.result %[[BOX]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -755,15 +745,15 @@ func.func @_QPtest5_stack(%arg0: !fir.box>> {fi // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] 
= fir.alloca !fir.array>, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>) -> !fir.ref>> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) source_box %[[VAL_0]] : (!fir.ref>>, !fir.shapeshift<2>, !fir.box>>) -> !fir.class>> +// CHECK: %[[BOX:.*]] = fir.convert %[[VAL_20]] : (!fir.class>>) -> !fir.box>> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[BOX]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_26]] : !fir.box>> +// CHECK: fir.result %[[BOX]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -830,13 +820,14 @@ func.func @_QPtest6(%arg0: !fir.class>> {fir.bi // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>>) -> !fir.class>>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>>) -> !fir.heap>> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.class>>) -> !fir.class>> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>>, !fir.shift<2>) -> !fir.class>> // CHECK: fir.result %[[VAL_34]] : !fir.class>> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class>> @@ -906,13 +897,14 @@ func.func @_QPtest6_stack(%arg0: !fir.class>> { // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // 
CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>>) -> !fir.class>>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>>) -> !fir.heap>> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.class>>) -> !fir.class>> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>>, !fir.shift<2>) -> !fir.class>> // CHECK: fir.result %[[VAL_34]] : !fir.class>> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class>> @@ -981,13 +973,14 @@ func.func @_QPtest7(%arg0: !fir.class> {fir.bindc_name = "x // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>) -> !fir.class>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>) -> !fir.heap> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>, !fir.shapeshift<2>, !fir.class>) -> !fir.class> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>, !fir.shift<2>) -> !fir.class> // CHECK: fir.result %[[VAL_34]] : !fir.class> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class> @@ -1057,13 +1050,14 @@ func.func @_QPtest7_stack(%arg0: !fir.class> {fir.bindc_nam // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>) -> !fir.class>> 
+// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>) -> !fir.heap> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>, !fir.shapeshift<2>, !fir.class>) -> !fir.class> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>, !fir.shift<2>) -> !fir.class> // CHECK: fir.result %[[VAL_34]] : !fir.class> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class> diff --git a/flang/unittests/Optimizer/Builder/CharacterTest.cpp b/flang/unittests/Optimizer/Builder/CharacterTest.cpp index 6d912b81d9541..d8d2da40ba9a6 100644 --- a/flang/unittests/Optimizer/Builder/CharacterTest.cpp +++ b/flang/unittests/Optimizer/Builder/CharacterTest.cpp @@ -29,7 +29,7 @@ struct CharacterTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/ComplexTest.cpp b/flang/unittests/Optimizer/Builder/ComplexTest.cpp index 689af4642b0b6..d5f00c9b61108 100644 --- a/flang/unittests/Optimizer/Builder/ComplexTest.cpp +++ b/flang/unittests/Optimizer/Builder/ComplexTest.cpp @@ -25,7 +25,7 @@ struct ComplexTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp index 3e2af24c47b96..e4c21f6b65a36 100644 --- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp +++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp @@ -29,7 +29,7 @@ struct FIRBuilderTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); @@ -176,8 +176,7 @@ TEST_F(FIRBuilderTest, getNamedFunction) { auto func2 = builder.getNamedFunction("func2"); EXPECT_EQ(nullptr, func2); auto loc = builder.getUnknownLoc(); - func2 = builder.createFunction( - loc, "func2", builder.getFunctionType(std::nullopt, std::nullopt)); + func2 = 
builder.createFunction(loc, "func2", builder.getFunctionType({}, {})); auto func2query = builder.getNamedFunction("func2"); EXPECT_EQ(func2, func2query); } diff --git a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp index 29700d2d3dbff..a0785198b078d 100644 --- a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp +++ b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp @@ -28,7 +28,7 @@ struct HLFIRToolsTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h index 40abf567400b3..4ecec92f42dc2 100644 --- a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h +++ b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h @@ -26,9 +26,8 @@ struct RuntimeCallTest : public testing::Test { // Set the insertion point in the function entry block. moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); - mlir::func::FuncOp func = - builder.create(loc, "runtime_unit_tests_func", - builder.getFunctionType(std::nullopt, std::nullopt)); + mlir::func::FuncOp func = builder.create( + loc, "runtime_unit_tests_func", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 30c23b63b4d56..98270adaa7c73 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -21,9 +21,8 @@ struct FortranVariableTest : public testing::Test { // Set the insertion point in the function entry block. moduleOp = builder->create(loc); builder->setInsertionPointToStart(moduleOp->getBody()); - mlir::func::FuncOp func = - builder->create(loc, "fortran_variable_tests", - builder->getFunctionType(std::nullopt, std::nullopt)); + mlir::func::FuncOp func = builder->create( + loc, "fortran_variable_tests", builder->getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder->setInsertionPointToStart(entryBlock); } diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 507b3aa88babf..4f3704ec9aa9b 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -76,6 +76,13 @@ add_compile_definitions(LIBC_NAMESPACE=${LIBC_NAMESPACE}) set(LIBC_COMPILE_OPTIONS_DEFAULT "" CACHE STRING "Architecture to tell clang to optimize for (e.g. -march=... or -mcpu=...)") set(LIBC_TEST_COMPILE_OPTIONS_DEFAULT "" CACHE STRING "Common compile options for all the tests.") +set(LIBC_LINK_OPTIONS_DEFAULT "" CACHE STRING "Arguments used when linking.") +set(LIBC_TEST_LINK_OPTIONS_DEFAULT "" CACHE STRING "Common link options for all the tests.") + +set(LIBC_TEST_CMD "" CACHE STRING + "The full test command in the form binary=@BINARY@, if using another program to test (e.g. 
QEMU)")
+set(LIBC_TEST_HERMETIC_ONLY OFF CACHE BOOL "Only enable hermetic tests.")
+
 list(APPEND LIBC_COMPILE_OPTIONS_DEFAULT ${LIBC_COMMON_TUNE_OPTIONS})

 # Check --print-resource-dir to find the compiler resource dir if this flag
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 11937c8c8ab08..e210992c5111a 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -233,7 +233,11 @@ function(create_libc_unittest fq_target_name)
   _get_common_test_compile_options(compile_options "${LIBC_UNITTEST_C_TEST}"
                                    "${LIBC_UNITTEST_FLAGS}")
   # TODO: Ideally we would have a separate function for link options.
-  set(link_options ${compile_options})
+  set(link_options
+    ${compile_options}
+    ${LIBC_LINK_OPTIONS_DEFAULT}
+    ${LIBC_TEST_LINK_OPTIONS_DEFAULT}
+  )
   list(APPEND compile_options ${LIBC_UNITTEST_COMPILE_OPTIONS})

   if(SHOW_INTERMEDIATE_OBJECTS)
@@ -580,12 +584,26 @@ function(add_integration_test test_name)
       -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
       "--cuda-path=${LIBC_CUDA_ROOT}")
   elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
-    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
+    set(link_options
+      -nolibc
+      -nostartfiles
+      -nostdlib++
+      -static
+      ${LIBC_LINK_OPTIONS_DEFAULT}
+      ${LIBC_TEST_LINK_OPTIONS_DEFAULT}
+    )
+    target_link_options(${fq_build_target_name} PRIVATE ${link_options})
   else()
     # Older version of gcc does not support `nostdlib++` flag. We use
     # `nostdlib` and link against libgcc_s, which cannot be linked statically.
-    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib)
-    list(APPEND link_libraries ${LIBGCC_S_LOCATION})
+    set(link_options
+      -nolibc
+      -nostartfiles
+      -static
+      ${LIBC_LINK_OPTIONS_DEFAULT}
+      ${LIBC_TEST_LINK_OPTIONS_DEFAULT}
+    )
+    target_link_options(${fq_build_target_name} PRIVATE ${link_options})
   endif()
   target_link_libraries(
     ${fq_build_target_name}
@@ -774,11 +792,26 @@ function(add_libc_hermetic test_name)
       -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
       "--cuda-path=${LIBC_CUDA_ROOT}")
   elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
-    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
+    set(link_options
+      -nolibc
+      -nostartfiles
+      -nostdlib++
+      -static
+      ${LIBC_LINK_OPTIONS_DEFAULT}
+      ${LIBC_TEST_LINK_OPTIONS_DEFAULT}
+    )
+    target_link_options(${fq_build_target_name} PRIVATE ${link_options})
   else()
     # Older version of gcc does not support `nostdlib++` flag. We use
     # `nostdlib` and link against libgcc_s, which cannot be linked statically.
-    target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib)
+    set(link_options
+      -nolibc
+      -nostartfiles
+      -static
+      ${LIBC_LINK_OPTIONS_DEFAULT}
+      ${LIBC_TEST_LINK_OPTIONS_DEFAULT}
+    )
+    target_link_options(${fq_build_target_name} PRIVATE ${link_options})
     list(APPEND link_libraries ${LIBGCC_S_LOCATION})
   endif()
   target_link_libraries(
@@ -809,9 +842,16 @@
   endif()

   if(NOT HERMETIC_TEST_NO_RUN_POSTBUILD)
-    set(test_cmd ${HERMETIC_TEST_ENV}
+    if (LIBC_TEST_CMD)
+      # In the form of " binary=@BINARY@", e.g. "qemu-system-arm -loader$file=@BINARY@"
+      string(REPLACE "@BINARY@" "$" test_cmd_parsed ${LIBC_TEST_CMD})
+      string(REPLACE " " ";" test_cmd "${test_cmd_parsed}")
+    else()
+      set(test_cmd ${HERMETIC_TEST_ENV}
       $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
       $ ${HERMETIC_TEST_ARGS})
+    endif()
+
     add_custom_target(
       ${fq_target_name}
       DEPENDS ${fq_target_name}.__cmd__
@@ -863,7 +903,9 @@ function(add_libc_test test_name)
       # Tests like the file tests perform file operations on disk file. If we
       # don't chain up the unit test and hermetic test, then those tests will
       # step on each other's files.
-      add_dependencies(${fq_test_name}.__hermetic__ ${fq_test_name}.__unit__)
+      if(NOT LIBC_TEST_HERMETIC_ONLY)
+        add_dependencies(${fq_test_name}.__hermetic__ ${fq_test_name}.__unit__)
+      endif()
     endif()
   endif()
 endfunction(add_libc_test)
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index de7549c57ff44..80cd15eebc91f 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -278,6 +278,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.wcslen
     libc.src.wchar.wctob

+    # wctype.h entrypoints
+    libc.src.wctype.iswalpha
+
     # internal entrypoints
     libc.startup.baremetal.init
     libc.startup.baremetal.fini
diff --git a/libc/config/baremetal/arm/headers.txt b/libc/config/baremetal/arm/headers.txt
index 5666ef7e0012d..1f64afebdaaa7 100644
--- a/libc/config/baremetal/arm/headers.txt
+++ b/libc/config/baremetal/arm/headers.txt
@@ -23,4 +23,5 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.time
     libc.include.uchar
     libc.include.wchar
+    libc.include.wctype
 )
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index 7e8c186d52469..c9f8118f6e800 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -278,6 +278,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.wcslen
     libc.src.wchar.wctob

+    # wctype.h entrypoints
+    libc.src.wctype.iswalpha
+
     # internal entrypoints
     libc.startup.baremetal.init
     libc.startup.baremetal.fini
diff --git a/libc/config/baremetal/riscv/headers.txt b/libc/config/baremetal/riscv/headers.txt
index 5666ef7e0012d..1f64afebdaaa7 100644
--- a/libc/config/baremetal/riscv/headers.txt
+++ b/libc/config/baremetal/riscv/headers.txt
@@ -23,4 +23,5 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.time
     libc.include.uchar
     libc.include.wchar
+    libc.include.wctype
 )
diff --git a/libc/config/darwin/aarch64/entrypoints.txt b/libc/config/darwin/aarch64/entrypoints.txt
index 4674a9309115b..3bfdcdbee555e 100644
--- a/libc/config/darwin/aarch64/entrypoints.txt
+++ b/libc/config/darwin/aarch64/entrypoints.txt
@@ -99,6 +99,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.calloc
     libc.src.stdlib.realloc
     libc.src.stdlib.free
+
+    # wctype.h entrypoints
+    libc.src.wctype.iswalpha
 )

 if(LLVM_LIBC_FULL_BUILD)
diff --git a/libc/config/darwin/aarch64/headers.txt b/libc/config/darwin/aarch64/headers.txt
index 8f3d6029c9b6a..55a112c0c3ad3 100644
--- a/libc/config/darwin/aarch64/headers.txt
+++ b/libc/config/darwin/aarch64/headers.txt
@@ -11,4 +11,5 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.stdlib
     libc.include.string
     libc.include.strings
+    libc.include.wctype
 )
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index cff5b7f8312d6..b2abebee017d8 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -363,6 +363,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.wcslen
     libc.src.wchar.wctob

+    # wctype.h entrypoints
+    libc.src.wctype.iswalpha
+
     # sys/uio.h entrypoints
     libc.src.sys.uio.writev
     libc.src.sys.uio.readv
diff --git a/libc/config/linux/aarch64/headers.txt b/libc/config/linux/aarch64/headers.txt
index 01b0bf36498ce..6d3bc9188583b 100644
--- a/libc/config/linux/aarch64/headers.txt
+++ b/libc/config/linux/aarch64/headers.txt
@@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.uchar
     libc.include.unistd
     libc.include.wchar
+    libc.include.wctype
 )
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index a1203cc4991af..5865dc93a9aef 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -191,6 +191,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     # sys/time.h entrypoints
     libc.src.sys.time.setitimer
     libc.src.sys.time.getitimer
+
+    # wctype.h entrypoints
+    libc.src.wctype.iswalpha
 )

 if(LLVM_LIBC_FULL_BUILD)
diff --git a/libc/config/linux/arm/headers.txt b/libc/config/linux/arm/headers.txt
index 9aabac5dea33c..14c730e2b77b1 100644
--- a/libc/config/linux/arm/headers.txt
+++ b/libc/config/linux/arm/headers.txt
@@ -17,6 +17,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.strings
     libc.include.uchar
     libc.include.wchar
+    libc.include.wctype

     # Disabled due to epoll_wait syscalls not being available on this platform.
     # libc.include.sys_epoll
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 14361f5b6beff..79077a5e66ef5 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -368,6 +368,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.wcslen
     libc.src.wchar.wctob

+    # wctype.h entrypoints
+    libc.src.wctype.iswalpha
+
     # sys/uio.h entrypoints
     libc.src.sys.uio.writev
     libc.src.sys.uio.readv
diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt
index 01b0bf36498ce..6d3bc9188583b 100644
--- a/libc/config/linux/riscv/headers.txt
+++ b/libc/config/linux/riscv/headers.txt
@@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.uchar
     libc.include.unistd
     libc.include.wchar
+    libc.include.wctype
 )
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 9223911f04a93..381359cec6f1d 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -396,6 +396,8 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.wcstoul
     libc.src.wchar.wcstoull

+    # wctype.h entrypoints
+    libc.src.wctype.iswalpha

     # sys/uio.h entrypoints
     libc.src.sys.uio.writev
diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt
index 01b0bf36498ce..6d3bc9188583b 100644
--- a/libc/config/linux/x86_64/headers.txt
+++ b/libc/config/linux/x86_64/headers.txt
@@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.uchar
     libc.include.unistd
     libc.include.wchar
+    libc.include.wctype
 )
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 8898fd74c302f..18027298acc18 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -105,6 +105,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     # unistd.h entrypoints
     libc.src.unistd.getentropy
+
+    # wctype.h entrypoints
+    libc.src.wctype.iswalpha
 )

 set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/config/windows/headers.txt b/libc/config/windows/headers.txt
index 6d9aae9276924..d4a0947d867bb 100644
--- a/libc/config/windows/headers.txt
+++ b/libc/config/windows/headers.txt
-7,4 +7,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.fenv libc.include.math libc.include.unistd + libc.include.wctype ) diff --git a/libc/docs/headers/search.rst b/libc/docs/headers/search.rst index 51832e9bdc2ed..82c3496eeb0ed 100644 --- a/libc/docs/headers/search.rst +++ b/libc/docs/headers/search.rst @@ -29,7 +29,7 @@ Type Name Available ============================ ========= ACTION |check| ENTRY |check| -VISIT +VISIT |check| ============================ ========= POSIX Standard Functions @@ -43,7 +43,7 @@ hdestroy |check| hsearch |check| insque |check| lfind |check| -lsearch +lsearch |check| remque |check| tdelete tfind diff --git a/libc/fuzzing/math/CMakeLists.txt b/libc/fuzzing/math/CMakeLists.txt index bf2be5c0b3cea..c1a93058764b3 100644 --- a/libc/fuzzing/math/CMakeLists.txt +++ b/libc/fuzzing/math/CMakeLists.txt @@ -134,15 +134,6 @@ add_libc_fuzzer( libc.src.math.cos ) -add_libc_fuzzer( - atan_fuzz - NEED_MPFR - SRCS - atan_fuzz.cpp - DEPENDS - libc.src.math.atan -) - add_libc_fuzzer( tan_fuzz NEED_MPFR @@ -160,3 +151,48 @@ add_libc_fuzzer( DEPENDS libc.src.math.sincos ) + +add_libc_fuzzer( + log_fuzz + NEED_MPFR + SRCS + log_fuzz.cpp + DEPENDS + libc.src.math.log +) + +add_libc_fuzzer( + log10_fuzz + NEED_MPFR + SRCS + log10_fuzz.cpp + DEPENDS + libc.src.math.log10 +) + +add_libc_fuzzer( + log1p_fuzz + NEED_MPFR + SRCS + log1p_fuzz.cpp + DEPENDS + libc.src.math.log1p +) + +add_libc_fuzzer( + log2_fuzz + NEED_MPFR + SRCS + log2_fuzz.cpp + DEPENDS + libc.src.math.log2 +) + +add_libc_fuzzer( + sqrt_fuzz + NEED_MPFR + SRCS + sqrt_fuzz.cpp + DEPENDS + libc.src.__support.FPUtil.generic.sqrt +) diff --git a/libc/fuzzing/math/acos_fuzz.cpp b/libc/fuzzing/math/acos_fuzz.cpp index d2b5456026839..48fb4eacc3a79 100644 --- a/libc/fuzzing/math/acos_fuzz.cpp +++ b/libc/fuzzing/math/acos_fuzz.cpp @@ -12,26 +12,40 @@ #include "src/math/acos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf and values outside accepted range - if (isnan(x) || isinf(x) || x > 1 || x < -1) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_acos(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x > 1 || x < -1) + continue; - double result = LIBC_NAMESPACE::acos(x); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_acos(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::acos(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/asin_fuzz.cpp b/libc/fuzzing/math/asin_fuzz.cpp index 
94ae5c7bfdeee..e27d179606824 100644 --- a/libc/fuzzing/math/asin_fuzz.cpp +++ b/libc/fuzzing/math/asin_fuzz.cpp @@ -12,26 +12,41 @@ #include "src/math/asin.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf and values outside accepted range - if (isnan(x) || isinf(x) || x > 1 || x < -1) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_asin(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::asin(x); + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x > 1 || x < -1) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_asin(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::asin(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/atan_fuzz.cpp b/libc/fuzzing/math/atan_fuzz.cpp deleted file mode 100644 index 3b485786e3a63..0000000000000 --- a/libc/fuzzing/math/atan_fuzz.cpp +++ /dev/null @@ -1,38 +0,0 @@ -//===-- atan_fuzz.cpp -----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// Fuzzing test for llvm-libc atan implementation. 
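The acos and asin hunks above, and the sibling fuzzers that follow, all apply one mechanical conversion: the entry point no longer receives a single pre-filtered `double` but the raw libFuzzer `(data, size)` byte buffer, which it walks `sizeof(double)` bytes at a time. A minimal sketch of that skeleton, with hypothetical `libc_fn` and `reference_fn` stand-ins for the per-function details (each real harness substitutes one libc entrypoint and its MPFR oracle, plus its own domain filter):

```cpp
#include <cstdint>
#include <cstring>
#include <math.h>
#include <stddef.h>

// Hypothetical stand-ins; a real harness plugs in e.g. LIBC_NAMESPACE::acos
// and an MPFR-based reference computation here.
static double libc_fn(double x) { return x; }
static double reference_fn(double x) { return x; }

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  // Consume the fuzzer's byte buffer one double at a time.
  for (size_t i = 0; i < size / sizeof(double); ++i) {
    double x;
    std::memcpy(&x, data, sizeof(double)); // safe, possibly unaligned read
    data += sizeof(double);

    // Skip inputs outside the domain the harness intends to exercise.
    if (isnan(x) || isinf(x))
      continue;

    if (libc_fn(x) != reference_fn(x))
      __builtin_trap(); // libFuzzer records the crashing input
  }
  return 0;
}
```

One buffer can thus exercise many values per run, and rejected values are skipped with `continue` rather than discarding the whole input as the old single-`double` harnesses did.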
-/// -//===----------------------------------------------------------------------===// - -#include "src/math/atan.h" -#include "utils/MPFRWrapper/mpfr_inc.h" -#include - -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; - mpfr_t input; - mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_atan(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); - - double result = LIBC_NAMESPACE::atan(x); - - if (result != to_compare) - __builtin_trap(); - - mpfr_clear(input); - return 0; -} diff --git a/libc/fuzzing/math/cos_fuzz.cpp b/libc/fuzzing/math/cos_fuzz.cpp index 5b5ba0f7de717..6ed1e9ed8f309 100644 --- a/libc/fuzzing/math/cos_fuzz.cpp +++ b/libc/fuzzing/math/cos_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/cos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_cos(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::cos(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_cos(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::cos(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/exp10_fuzz.cpp b/libc/fuzzing/math/exp10_fuzz.cpp index 2baef03a264a4..d939948b723a5 100644 --- a/libc/fuzzing/math/exp10_fuzz.cpp +++ b/libc/fuzzing/math/exp10_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp10.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_exp10(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp10(x); + // 
remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp10(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp10(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/exp2_fuzz.cpp b/libc/fuzzing/math/exp2_fuzz.cpp index 8a2959047a6ca..a29d3c00da672 100644 --- a/libc/fuzzing/math/exp2_fuzz.cpp +++ b/libc/fuzzing/math/exp2_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp2.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_exp2(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp2(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp2(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp2(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/exp_fuzz.cpp b/libc/fuzzing/math/exp_fuzz.cpp index 97bc12dfa64c9..66823596dc6fa 100644 --- a/libc/fuzzing/math/exp_fuzz.cpp +++ b/libc/fuzzing/math/exp_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_exp(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros 
already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/expm1_fuzz.cpp b/libc/fuzzing/math/expm1_fuzz.cpp index db507bb02b1d7..0690e449c3d23 100644 --- a/libc/fuzzing/math/expm1_fuzz.cpp +++ b/libc/fuzzing/math/expm1_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/expm1.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_expm1(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::expm1(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_expm1(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::expm1(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/log10_fuzz.cpp b/libc/fuzzing/math/log10_fuzz.cpp new file mode 100644 index 0000000000000..369408cc288b5 --- /dev/null +++ b/libc/fuzzing/math/log10_fuzz.cpp @@ -0,0 +1,51 @@ +//===-- log10_fuzz.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Fuzzing test for llvm-libc log10 implementation. 
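Inside the loop, every harness computes its oracle the same way: evaluate the function in 53-bit MPFR precision, then round the result back to `double`. A sketch of just that step, written against raw `<mpfr.h>` rather than the project's `utils/MPFRWrapper/mpfr_inc.h`, using log10 as the running example; note that `mpfr_subnormalize` consumes the ternary value returned by the evaluation and re-rounds according to the currently configured exponent range:

```cpp
#include <mpfr.h>

// Correctly rounded double-precision reference for log10(x), mirroring the
// oracle step in the harnesses above.
static double mpfr_log10_as_double(double x) {
  mpfr_t v;
  mpfr_init2(v, 53);           // 53-bit significand: double precision
  mpfr_set_d(v, x, MPFR_RNDN); // exact, x is already a double
  int ternary = mpfr_log10(v, v, MPFR_RNDN);
  mpfr_subnormalize(v, ternary, MPFR_RNDN);
  double result = mpfr_get_d(v, MPFR_RNDN);
  mpfr_clear(v);
  return result;
}
```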
+/// +//===----------------------------------------------------------------------===// + +#include "src/math/log10.h" +#include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + mpfr_t input; + mpfr_init2(input, 53); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); + + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x < 0) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_log10(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::log10(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } + mpfr_clear(input); + return 0; +} diff --git a/libc/fuzzing/math/log1p_fuzz.cpp b/libc/fuzzing/math/log1p_fuzz.cpp new file mode 100644 index 0000000000000..e02c61a352c1f --- /dev/null +++ b/libc/fuzzing/math/log1p_fuzz.cpp @@ -0,0 +1,50 @@ +//===-- log1p_fuzz.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Fuzzing test for llvm-libc log1p implementation. +/// +//===----------------------------------------------------------------------===// + +#include "src/math/log1p.h" +#include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + mpfr_t input; + mpfr_init2(input, 53); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x < -1) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_log1p(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::log1p(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } + mpfr_clear(input); + return 0; +} diff --git a/libc/fuzzing/math/log2_fuzz.cpp b/libc/fuzzing/math/log2_fuzz.cpp new file mode 100644 index 0000000000000..c3e53c639cba9 --- /dev/null +++ b/libc/fuzzing/math/log2_fuzz.cpp @@ -0,0 +1,51 @@ +//===-- log2_fuzz.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc log2 implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log2.h"
+#include "utils/MPFRWrapper/mpfr_inc.h"
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <math.h>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  mpfr_t input;
+  mpfr_init2(input, 53);
+  for (size_t i = 0; i < size / sizeof(double); ++i) {
+    double x;
+    std::memcpy(&x, data, sizeof(double));
+    data += sizeof(double);
+
+    // remove NaN and inf and values outside accepted range
+    if (isnan(x) || isinf(x) || x < 0)
+      continue;
+    // signed zeros already tested in unit tests
+    if (signbit(x) && x == 0.0)
+      continue;
+
+    mpfr_set_d(input, x, MPFR_RNDN);
+    int output = mpfr_log2(input, input, MPFR_RNDN);
+    mpfr_subnormalize(input, output, MPFR_RNDN);
+    double to_compare = mpfr_get_d(input, MPFR_RNDN);
+
+    double result = LIBC_NAMESPACE::log2(x);
+
+    if (result != to_compare) {
+      std::cout << std::hexfloat << "Failing input: " << x << std::endl;
+      std::cout << std::hexfloat << "Failing output: " << result << std::endl;
+      std::cout << std::hexfloat << "Expected: " << to_compare << std::endl;
+      __builtin_trap();
+    }
+  }
+  mpfr_clear(input);
+  return 0;
+}
diff --git a/libc/fuzzing/math/log_fuzz.cpp b/libc/fuzzing/math/log_fuzz.cpp
new file mode 100644
index 0000000000000..9618accf3db26
--- /dev/null
+++ b/libc/fuzzing/math/log_fuzz.cpp
@@ -0,0 +1,50 @@
+//===-- log_fuzz.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc log implementation.
+/// +//===----------------------------------------------------------------------===// + +#include "src/math/log.h" +#include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + mpfr_t input; + mpfr_init2(input, 53); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); + + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x < 0) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_log(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::log(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } + mpfr_clear(input); + return 0; +} diff --git a/libc/fuzzing/math/sin_fuzz.cpp b/libc/fuzzing/math/sin_fuzz.cpp index a5f0fa95c1581..f6d59c7e496bc 100644 --- a/libc/fuzzing/math/sin_fuzz.cpp +++ b/libc/fuzzing/math/sin_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/sin.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_sin(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::sin(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_sin(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::sin(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/sincos_fuzz.cpp b/libc/fuzzing/math/sincos_fuzz.cpp index 8cc6f7291a3df..3d3306721fc47 100644 --- a/libc/fuzzing/math/sincos_fuzz.cpp +++ b/libc/fuzzing/math/sincos_fuzz.cpp @@ -6,21 +6,18 @@ // //===----------------------------------------------------------------------===// /// -/// Fuzzing test for llvm-libc cos implementation. +/// Fuzzing test for llvm-libc sincos implementation. 
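One detail shared by all of these loops, including the sincos conversion above, is that the input is extracted with `std::memcpy` rather than by casting the buffer pointer. A minimal illustration of the pattern and the reason for it:

```cpp
#include <cstdint>
#include <cstring>

// Reading a double straight out of the byte stream with
//   const double *p = reinterpret_cast<const double *>(data);
// would risk undefined behavior: the fuzzer buffer need not be aligned for
// double, and the access would violate strict aliasing. A memcpy of
// sizeof(double) bytes is well-defined and compiles to a plain load on
// mainstream targets.
static double next_double(const uint8_t *&data) {
  double x;
  std::memcpy(&x, data, sizeof(double));
  data += sizeof(double);
  return x;
}
```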
/// //===----------------------------------------------------------------------===// #include "src/math/sincos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf as preconditions - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_t sin_x; mpfr_t cos_x; @@ -28,21 +25,43 @@ extern "C" int LLVMFuzzerTestOneInput(double x) { mpfr_init2(input, 53); mpfr_init2(sin_x, 53); mpfr_init2(cos_x, 53); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - mpfr_set_d(input, x, MPFR_RNDN); + // remove NaN and inf as preconditions + if (isnan(x) || isinf(x)) + continue; - int output = mpfr_sin_cos(sin_x, cos_x, input, MPFR_RNDN); - mpfr_subnormalize(sin_x, output, MPFR_RNDN); - mpfr_subnormalize(cos_x, output, MPFR_RNDN); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - double to_compare_sin = mpfr_get_d(sin_x, MPFR_RNDN); - double to_compare_cos = mpfr_get_d(cos_x, MPFR_RNDN); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_sin_cos(sin_x, cos_x, input, MPFR_RNDN); + mpfr_subnormalize(sin_x, output, MPFR_RNDN); + mpfr_subnormalize(cos_x, output, MPFR_RNDN); - double sin_res, cos_res; - LIBC_NAMESPACE::sincos(x, &sin_res, &cos_res); + double to_compare_sin = mpfr_get_d(sin_x, MPFR_RNDN); + double to_compare_cos = mpfr_get_d(cos_x, MPFR_RNDN); - if (sin_res != to_compare_sin || cos_res != to_compare_cos) - __builtin_trap(); + double sin_res, cos_res; + LIBC_NAMESPACE::sincos(x, &sin_res, &cos_res); + + if (sin_res != to_compare_sin || cos_res != to_compare_cos) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing sin output: " << sin_res + << std::endl; + std::cout << std::hexfloat << "Expected sin: " << to_compare_sin + << std::endl; + std::cout << std::hexfloat << "Failing cos output: " << cos_res + << std::endl; + std::cout << std::hexfloat << "Expected cos: " << to_compare_cos + << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); mpfr_clear(sin_x); diff --git a/libc/fuzzing/math/sqrt_fuzz.cpp b/libc/fuzzing/math/sqrt_fuzz.cpp new file mode 100644 index 0000000000000..969b4f58e342c --- /dev/null +++ b/libc/fuzzing/math/sqrt_fuzz.cpp @@ -0,0 +1,50 @@ +//===-- sqrt_fuzz.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Fuzzing test for llvm-libc sqrt implementation. 
+/// +//===----------------------------------------------------------------------===// + +#include "src/__support/FPUtil/generic/sqrt.h" +#include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + mpfr_t input; + mpfr_init2(input, 53); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x < 0) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_sqrt(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::fputil::sqrt(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } + mpfr_clear(input); + return 0; +} diff --git a/libc/fuzzing/math/tan_fuzz.cpp b/libc/fuzzing/math/tan_fuzz.cpp index 2a462fa34fce4..63d3b12866a0e 100644 --- a/libc/fuzzing/math/tan_fuzz.cpp +++ b/libc/fuzzing/math/tan_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/tan.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_tan(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::tan(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_tan(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::tan(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 55268d19529c7..73213826ad607 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -69,7 +69,6 @@ add_header_macro( ../libc/include/dlfcn.yaml dlfcn.h DEPENDS - .llvm-libc-macros.dlfcn_macros .llvm_libc_common_h ) @@ -720,6 +719,15 @@ add_header_macro( .llvm-libc-types.wchar_t ) +add_header_macro( + wctype + ../libc/include/wctype.yaml + wctype.h + DEPENDS + .llvm_libc_common_h + .llvm-libc-types.wint_t +) + add_header_macro( 
locale ../libc/include/locale.yaml diff --git a/libc/include/dlfcn.yaml b/libc/include/dlfcn.yaml index 78bbeff4e60d9..28be34dbd95bd 100644 --- a/libc/include/dlfcn.yaml +++ b/libc/include/dlfcn.yaml @@ -1,17 +1,34 @@ header: dlfcn.h -header_template: dlfcn.h.def +standards: + - posix macros: + # Note that macro values are quoted to keep the integer literals as + # written. Without the quotes, YAML will normalize them to minimal + # decimal, which is less readable for humans seeing the generated header. - macro_name: RTLD_LAZY - macro_header: dlfcn-macros.h + macro_value: "0x00001" - macro_name: RTLD_NOW - macro_header: dlfcn-macros.h + macro_value: "0x00002" - macro_name: RTLD_GLOBAL - macro_header: dlfcn-macros.h + macro_value: "0x00100" - macro_name: RTLD_LOCAL - macro_header: dlfcn-macros.h -types: [] -enums: [] -objects: [] + macro_value: "0" + - macro_name: RTLD_BINDING_MASK + standards: + - gnu + macro_value: "0x00003" + - macro_name: RTLD_NOLOAD + standards: + - gnu + macro_value: "0x00004" + - macro_name: RTLD_DEEPBIND + standards: + - gnu + macro_value: "0x00008" + - macro_name: RTLD_NODELETE + standards: + - gnu + macro_value: "0x01000" functions: - name: dlclose standards: diff --git a/libc/include/llvm-libc-macros/dlfcn-macros.h b/libc/include/llvm-libc-macros/dlfcn-macros.h deleted file mode 100644 index dcd202b9ab435..0000000000000 --- a/libc/include/llvm-libc-macros/dlfcn-macros.h +++ /dev/null @@ -1,23 +0,0 @@ -//===-- Definition of macros from dlfcn.h ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_MACROS_DLFCN_MACROS_H -#define LLVM_LIBC_MACROS_DLFCN_MACROS_H - -#define RTLD_LAZY 0x00001 -#define RTLD_NOW 0x00002 -#define RTLD_GLOBAL 0x00100 -#define RTLD_LOCAL 0 - -// Non-standard stuff here -#define RTLD_BINDING_MASK 0x3 -#define RTLD_NOLOAD 0x00004 -#define RTLD_DEEPBIND 0x00008 -#define RTLD_NODELETE 0x01000 - -#endif // LLVM_LIBC_MACROS_DLFCN_MACROS_H diff --git a/libc/include/wctype.yaml b/libc/include/wctype.yaml new file mode 100644 index 0000000000000..fb4f96f7d17e4 --- /dev/null +++ b/libc/include/wctype.yaml @@ -0,0 +1,10 @@ +header: wctype.h +types: + - type_name: wint_t +functions: + - name: iswalpha + standards: + - stdc + return_type: int + arguments: + - type: wint_t diff --git a/libc/shared/math.h b/libc/shared/math.h index b2f1a03e0940d..2ae7c1d58ae10 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -11,6 +11,9 @@ #include "libc_common.h" +#include "math/exp.h" +#include "math/exp10.h" +#include "math/exp10f.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp.h b/libc/shared/math/exp.h new file mode 100644 index 0000000000000..7cdd6331e613a --- /dev/null +++ b/libc/shared/math/exp.h @@ -0,0 +1,23 @@ +//===-- Shared exp function -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP_H +#define LLVM_LIBC_SHARED_MATH_EXP_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP_H diff --git a/libc/shared/math/exp10.h b/libc/shared/math/exp10.h new file mode 100644 index 0000000000000..3d36d9103705f --- /dev/null +++ b/libc/shared/math/exp10.h @@ -0,0 +1,23 @@ +//===-- Shared exp10 function -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10_H +#define LLVM_LIBC_SHARED_MATH_EXP10_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp10.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP10_H diff --git a/libc/shared/math/exp10f.h b/libc/shared/math/exp10f.h new file mode 100644 index 0000000000000..cd2ba54e6f4f2 --- /dev/null +++ b/libc/shared/math/exp10f.h @@ -0,0 +1,23 @@ +//===-- Shared exp10f function ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_EXP10F_H
+#define LLVM_LIBC_SHARED_MATH_EXP10F_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/exp10f.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::exp10f;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_EXP10F_H
diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt
index a665253c4cc03..d7a1e1f49e6ff 100644
--- a/libc/src/CMakeLists.txt
+++ b/libc/src/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(strings)
 add_subdirectory(time)
 add_subdirectory(unistd)
 add_subdirectory(wchar)
+add_subdirectory(wctype)
 
 if(${LIBC_TARGET_OS} STREQUAL "linux")
   add_subdirectory(dirent)
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 294d68474bd53..37a27cc9b7007 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -406,7 +406,7 @@ add_subdirectory(time)
 
 # Requires access to uchar header which is not on macos
 # Therefore, cannot currently build this on macos in overlay mode
-if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+if(NOT (LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
 
diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h
index 41104620ed61d..7bec4e30a9960 100644
--- a/libc/src/__support/FPUtil/PolyEval.h
+++ b/libc/src/__support/FPUtil/PolyEval.h
@@ -37,7 +37,7 @@ LIBC_INLINE cpp::enable_if_t<(sizeof(T) <= sizeof(void *)), T> polyeval(T,
 }
 
 template <typename T, typename... Ts>
-LIBC_INLINE cpp::enable_if_t<(sizeof(T) > sizeof(void *)), T>
+LIBC_INLINE static constexpr cpp::enable_if_t<(sizeof(T) > sizeof(void *)), T>
 polyeval(const T &x, const T &a0, const Ts &...a) {
   return multiply_add(x, polyeval(x, a...), a0);
 }
diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
index c27885aadc028..8e54e845de493 100644
--- a/libc/src/__support/FPUtil/double_double.h
+++ b/libc/src/__support/FPUtil/double_double.h
@@ -151,8 +151,8 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) {
 }
 
 template
-LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a,
-                                    const DoubleDouble &b) {
+LIBC_INLINE constexpr DoubleDouble quick_mult(const DoubleDouble &a,
+                                              const DoubleDouble &b) {
   DoubleDouble r = exact_mult(a.hi, b.hi);
   double t1 = multiply_add(a.hi, b.lo, r.lo);
   double t2 = multiply_add(a.lo, b.hi, t1);
diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
index 8260702e2c9f4..1469326c9ba3e 100644
--- a/libc/src/__support/FPUtil/multiply_add.h
+++ b/libc/src/__support/FPUtil/multiply_add.h
@@ -29,7 +29,7 @@ multiply_add(const T &x, const T &y, const T &z) {
 }
 
 template <typename T>
-LIBC_INLINE cpp::enable_if_t<(sizeof(T) <= sizeof(void *)), T>
+LIBC_INLINE static constexpr cpp::enable_if_t<(sizeof(T) <= sizeof(void *)), T>
 multiply_add(T x, T y, T z) {
   return x * y + z;
 }
diff --git a/libc/src/__support/FPUtil/nearest_integer.h b/libc/src/__support/FPUtil/nearest_integer.h
index 768f13414bd95..fabd309bd3520 100644
--- a/libc/src/__support/FPUtil/nearest_integer.h
+++ b/libc/src/__support/FPUtil/nearest_integer.h
@@ -40,7 +40,7 @@ namespace fputil {
 // Notice that for AARch64 and x86-64 with SSE4.2 support, we will use their
 // corresponding rounding instruction instead. And in those cases, the results
 // are rounded to the nearest integer, tie-to-even.
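The comment above describes the branch-free rounding idiom that `nearest_integer` uses when no hardware rounding instruction is available; the hunk that follows only marks the two overloads `static constexpr`. A self-contained illustration of the trick, assuming the default FE_TONEAREST rounding mode (the library versions use 0x1.0p23f for float and 0x1.0p52 for double, exactly as in the code below this insert):

```cpp
#include <cstdio>

// For |x| < 2^52, adding and then subtracting 2^52 forces the significand to
// align at the unit bit, so the intermediate rounding of the addition rounds
// x to the nearest integer (ties-to-even under FE_TONEAREST) as a side effect.
static double nearest_by_magic(double x) {
  return x < 0 ? (x - 0x1.0p52) + 0x1.0p52 : (x + 0x1.0p52) - 0x1.0p52;
}

int main() {
  std::printf("%g -> %g\n", 2.5, nearest_by_magic(2.5));   // 2 (tie-to-even)
  std::printf("%g -> %g\n", -3.7, nearest_by_magic(-3.7)); // -4
}
```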
-LIBC_INLINE float nearest_integer(float x) { +LIBC_INLINE static constexpr float nearest_integer(float x) { if (x < 0x1p24f && x > -0x1p24f) { float r = x < 0 ? (x - 0x1.0p23f) + 0x1.0p23f : (x + 0x1.0p23f) - 0x1.0p23f; float diff = x - r; @@ -56,7 +56,7 @@ LIBC_INLINE float nearest_integer(float x) { return x; } -LIBC_INLINE double nearest_integer(double x) { +LIBC_INLINE static constexpr double nearest_integer(double x) { if (x < 0x1p53 && x > -0x1p53) { double r = x < 0 ? (x - 0x1.0p52) + 0x1.0p52 : (x + 0x1.0p52) - 0x1.0p52; double diff = x - r; diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 900c0ab04d3a3..ad36679409f89 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -110,3 +110,91 @@ add_header_library( DEPENDS libc.src.__support.FPUtil.manipulation_functions ) + +add_header_library( + exp_constants + HDRS + exp_constants.h + DEPENDS + libc.src.__support.FPUtil.triple_double +) + +add_header_library( + exp_utils + HDRS + exp_utils.h + DEPENDS + libc.src.__support.CPP.optional + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fp_bits +) + +add_header_library( + exp + HDRS + exp.h + DEPENDS + .exp_constants + .exp_utils + libc.src.__support.CPP.bit + libc.src.__support.CPP.optional + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.FPUtil.triple_double + libc.src.__support.integer_literals + libc.src.__support.macros.optimization +) + +add_header_library( + exp10 + HDRS + exp10.h + DEPENDS + .exp_constants + .exp_utils + libc.src.__support.CPP.bit + libc.src.__support.CPP.optional + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.FPUtil.triple_double + libc.src.__support.integer_literals + libc.src.__support.macros.optimization +) + +add_header_library( + exp10f_utils + HDRS + exp10f_utils.h + DEPENDS + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.common + libc.src.__support.math.exp_utils +) + +add_header_library( + exp10f + HDRS + exp10f.h + DEPENDS + .exp10f_utils + libc.src.__support.macros.config + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization +) diff --git a/libc/src/__support/math/exp.h b/libc/src/__support/math/exp.h new file mode 100644 index 0000000000000..a538df1e825dc --- /dev/null +++ b/libc/src/__support/math/exp.h @@ -0,0 +1,448 @@ +//===-- Implementation header for exp ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP_H + +#include "exp_constants.h" +#include "exp_utils.h" +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/FPUtil/triple_double.h" +#include "src/__support/common.h" +#include "src/__support/integer_literals.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +using fputil::DoubleDouble; +using fputil::TripleDouble; +using Float128 = typename fputil::DyadicFloat<128>; + +using LIBC_NAMESPACE::operator""_u128; + +// log2(e) +static constexpr double LOG2_E = 0x1.71547652b82fep+0; + +// Error bounds: +// Errors when using double precision. +static constexpr double ERR_D = 0x1.8p-63; + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Errors when using double-double precision. +static constexpr double ERR_DD = 0x1.0p-99; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// -2^-12 * log(2) +// > a = -2^-12 * log(2); +// > b = round(a, 30, RN); +// > c = round(a - b, 30, RN); +// > d = round(a - b - c, D, RN); +// Errors < 1.5 * 2^-133 +static constexpr double MLOG_2_EXP2_M12_HI = -0x1.62e42ffp-13; +static constexpr double MLOG_2_EXP2_M12_MID = 0x1.718432a1b0e26p-47; + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +static constexpr double MLOG_2_EXP2_M12_MID_30 = 0x1.718432ap-47; +static constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +namespace { + +// Polynomial approximations with double precision: +// Return expm1(dx) / x ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24. +// For |dx| < 2^-13 + 2^-30: +// | output - expm1(dx) / dx | < 2^-51. +static double poly_approx_d(double dx) { + // dx^2 + double dx2 = dx * dx; + // c0 = 1 + dx / 2 + double c0 = fputil::multiply_add(dx, 0.5, 1.0); + // c1 = 1/6 + dx / 24 + double c1 = + fputil::multiply_add(dx, 0x1.5555555555555p-5, 0x1.5555555555555p-3); + // p = dx^2 * c1 + c0 = 1 + dx / 2 + dx^2 / 6 + dx^3 / 24 + double p = fputil::multiply_add(dx2, c1, c0); + return p; +} + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Polynomial approximation with double-double precision: +// Return exp(dx) ~ 1 + dx + dx^2 / 2 + ... + dx^6 / 720 +// For |dx| < 2^-13 + 2^-30: +// | output - exp(dx) | < 2^-101 +static DoubleDouble poly_approx_dd(const DoubleDouble &dx) { + // Taylor polynomial. + constexpr DoubleDouble COEFFS[] = { + {0, 0x1p0}, // 1 + {0, 0x1p0}, // 1 + {0, 0x1p-1}, // 1/2 + {0x1.5555555555555p-57, 0x1.5555555555555p-3}, // 1/6 + {0x1.5555555555555p-59, 0x1.5555555555555p-5}, // 1/24 + {0x1.1111111111111p-63, 0x1.1111111111111p-7}, // 1/120 + {-0x1.f49f49f49f49fp-65, 0x1.6c16c16c16c17p-10}, // 1/720 + }; + + DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], + COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); + return p; +} + +// Polynomial approximation with 128-bit precision: +// Return exp(dx) ~ 1 + dx + dx^2 / 2 + ... 
+ dx^7 / 5040 +// For |dx| < 2^-13 + 2^-30: +// | output - exp(dx) | < 2^-126. +static Float128 poly_approx_f128(const Float128 &dx) { + constexpr Float128 COEFFS_128[]{ + {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 + {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 + {Sign::POS, -128, 0x80000000'00000000'00000000'00000000_u128}, // 0.5 + {Sign::POS, -130, 0xaaaaaaaa'aaaaaaaa'aaaaaaaa'aaaaaaab_u128}, // 1/6 + {Sign::POS, -132, 0xaaaaaaaa'aaaaaaaa'aaaaaaaa'aaaaaaab_u128}, // 1/24 + {Sign::POS, -134, 0x88888888'88888888'88888888'88888889_u128}, // 1/120 + {Sign::POS, -137, 0xb60b60b6'0b60b60b'60b60b60'b60b60b6_u128}, // 1/720 + {Sign::POS, -140, 0xd00d00d0'0d00d00d'00d00d00'd00d00d0_u128}, // 1/5040 + }; + + Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], + COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], + COEFFS_128[6], COEFFS_128[7]); + return p; +} + +// Compute exp(x) using 128-bit precision. +// TODO(lntue): investigate triple-double precision implementation for this +// step. +static Float128 exp_f128(double x, double kd, int idx1, int idx2) { + // Recalculate dx: + + double t1 = fputil::multiply_add(kd, MLOG_2_EXP2_M12_HI, x); // exact + double t2 = kd * MLOG_2_EXP2_M12_MID_30; // exact + double t3 = kd * MLOG_2_EXP2_M12_LO; // Error < 2^-133 + + Float128 dx = fputil::quick_add( + Float128(t1), fputil::quick_add(Float128(t2), Float128(t3))); + + // TODO: Skip recalculating exp_mid1 and exp_mid2. + Float128 exp_mid1 = + fputil::quick_add(Float128(EXP2_MID1[idx1].hi), + fputil::quick_add(Float128(EXP2_MID1[idx1].mid), + Float128(EXP2_MID1[idx1].lo))); + + Float128 exp_mid2 = + fputil::quick_add(Float128(EXP2_MID2[idx2].hi), + fputil::quick_add(Float128(EXP2_MID2[idx2].mid), + Float128(EXP2_MID2[idx2].lo))); + + Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); + + Float128 p = poly_approx_f128(dx); + + Float128 r = fputil::quick_mul(exp_mid, p); + + r.exponent += static_cast(kd) >> 12; + + return r; +} + +// Compute exp(x) with double-double precision. +static DoubleDouble exp_double_double(double x, double kd, + const DoubleDouble &exp_mid) { + // Recalculate dx: + // dx = x - k * 2^-12 * log(2) + double t1 = fputil::multiply_add(kd, MLOG_2_EXP2_M12_HI, x); // exact + double t2 = kd * MLOG_2_EXP2_M12_MID_30; // exact + double t3 = kd * MLOG_2_EXP2_M12_LO; // Error < 2^-130 + + DoubleDouble dx = fputil::exact_add(t1, t2); + dx.lo += t3; + + // Degree-6 Taylor polynomial approximation in double-double precision. + // | p - exp(x) | < 2^-100. + DoubleDouble p = poly_approx_dd(dx); + + // Error bounds: 2^-99. + DoubleDouble r = fputil::quick_mult(exp_mid, p); + + return r; +} +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// Check for exceptional cases when +// |x| <= 2^-53 or x < log(2^-1075) or x >= 0x1.6232bdd7abcd3p+9 +static double set_exceptional(double x) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + uint64_t x_abs = xbits.abs().uintval(); + + // |x| <= 2^-53 + if (x_abs <= 0x3ca0'0000'0000'0000ULL) { + // exp(x) ~ 1 + x + return 1 + x; + } + + // x <= log(2^-1075) || x >= 0x1.6232bdd7abcd3p+9 or inf/nan. 
+ + // x <= log(2^-1075) or -inf/nan + if (x_u >= 0xc087'4910'd52d'3052ULL) { + // exp(-Inf) = 0 + if (xbits.is_inf()) + return 0.0; + + // exp(nan) = nan + if (xbits.is_nan()) + return x; + + if (fputil::quick_get_round() == FE_UPWARD) + return FPBits::min_subnormal().get_val(); + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW); + return 0.0; + } + + // x >= round(log(MAX_NORMAL), D, RU) = 0x1.62e42fefa39fp+9 or +inf/nan + // x is finite + if (x_u < 0x7ff0'0000'0000'0000ULL) { + int rounding = fputil::quick_get_round(); + if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) + return FPBits::max_normal().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + } + // x is +inf or nan + return x + FPBits::inf().get_val(); +} + +} // namespace + +namespace math { + +static double exp(double x) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + + // Upper bound: max normal number = 2^1023 * (2 - 2^-52) + // > round(log (2^1023 ( 2 - 2^-52 )), D, RU) = 0x1.62e42fefa39fp+9 + // > round(log (2^1023 ( 2 - 2^-52 )), D, RD) = 0x1.62e42fefa39efp+9 + // > round(log (2^1023 ( 2 - 2^-52 )), D, RN) = 0x1.62e42fefa39efp+9 + // > round(exp(0x1.62e42fefa39fp+9), D, RN) = infty + + // Lower bound: min denormal number / 2 = 2^-1075 + // > round(log(2^-1075), D, RN) = -0x1.74910d52d3052p9 + + // Another lower bound: min normal number = 2^-1022 + // > round(log(2^-1022), D, RN) = -0x1.6232bdd7abcd2p9 + + // x < log(2^-1075) or x >= 0x1.6232bdd7abcd3p+9 or |x| < 2^-53. + if (LIBC_UNLIKELY(x_u >= 0xc0874910d52d3052 || + (x_u < 0xbca0000000000000 && x_u >= 0x40862e42fefa39f0) || + x_u < 0x3ca0000000000000)) { + return set_exceptional(x); + } + + // Now log(2^-1075) <= x <= -2^-53 or 2^-53 <= x < log(2^1023 * (2 - 2^-52)) + + // Range reduction: + // Let x = log(2) * (hi + mid1 + mid2) + lo + // in which: + // hi is an integer + // mid1 * 2^6 is an integer + // mid2 * 2^12 is an integer + // then: + // exp(x) = 2^hi * 2^(mid1) * 2^(mid2) * exp(lo). + // With this formula: + // - multiplying by 2^hi is exact and cheap, simply by adding the exponent + // field. + // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. + // - exp(lo) ~ 1 + lo + a0 * lo^2 + ... + // + // They can be defined by: + // hi + mid1 + mid2 = 2^(-12) * round(2^12 * log_2(e) * x) + // If we store L2E = round(log2(e), D, RN), then: + // log2(e) - L2E ~ 1.5 * 2^(-56) + // So the errors when computing in double precision is: + // | x * 2^12 * log_2(e) - D(x * 2^12 * L2E) | <= + // <= | x * 2^12 * log_2(e) - x * 2^12 * L2E | + + // + | x * 2^12 * L2E - D(x * 2^12 * L2E) | + // <= 2^12 * ( |x| * 1.5 * 2^-56 + eps(x)) for RN + // 2^12 * ( |x| * 1.5 * 2^-56 + 2*eps(x)) for other rounding modes. + // So if: + // hi + mid1 + mid2 = 2^(-12) * round(x * 2^12 * L2E) is computed entirely + // in double precision, the reduced argument: + // lo = x - log(2) * (hi + mid1 + mid2) is bounded by: + // |lo| <= 2^-13 + (|x| * 1.5 * 2^-56 + 2*eps(x)) + // < 2^-13 + (1.5 * 2^9 * 1.5 * 2^-56 + 2*2^(9 - 52)) + // < 2^-13 + 2^-41 + // + + // The following trick computes the round(x * L2E) more efficiently + // than using the rounding instructions, with the tradeoff for less accuracy, + // and hence a slightly larger range for the reduced argument `lo`. 
+ // + // To be precise, since |x| < |log(2^-1075)| < 1.5 * 2^9, + // |x * 2^12 * L2E| < 1.5 * 2^9 * 1.5 < 2^23, + // So we can fit the rounded result round(x * 2^12 * L2E) in int32_t. + // Thus, the goal is to be able to use an additional addition and fixed width + // shift to get an int32_t representing round(x * 2^12 * L2E). + // + // Assuming int32_t using 2-complement representation, since the mantissa part + // of a double precision is unsigned with the leading bit hidden, if we add an + // extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^25 to the product, the + // part that are < 2^e2 in resulted mantissa of (x*2^12*L2E + C) can be + // considered as a proper 2-complement representations of x*2^12*L2E. + // + // One small problem with this approach is that the sum (x*2^12*L2E + C) in + // double precision is rounded to the least significant bit of the dorminant + // factor C. In order to minimize the rounding errors from this addition, we + // want to minimize e1. Another constraint that we want is that after + // shifting the mantissa so that the least significant bit of int32_t + // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without + // any adjustment. So combining these 2 requirements, we can choose + // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence + // after right shifting the mantissa, the resulting int32_t has correct sign. + // With this choice of C, the number of mantissa bits we need to shift to the + // right is: 52 - 33 = 19. + // + // Moreover, since the integer right shifts are equivalent to rounding down, + // we can add an extra 0.5 so that it will become round-to-nearest, tie-to- + // +infinity. So in particular, we can compute: + // hmm = x * 2^12 * L2E + C, + // where C = 2^33 + 2^32 + 2^-1, then if + // k = int32_t(lower 51 bits of double(x * 2^12 * L2E + C) >> 19), + // the reduced argument: + // lo = x - log(2) * 2^-12 * k is bounded by: + // |lo| <= 2^-13 + 2^-41 + 2^-12*2^-19 + // = 2^-13 + 2^-31 + 2^-41. + // + // Finally, notice that k only uses the mantissa of x * 2^12 * L2E, so the + // exponent 2^12 is not needed. So we can simply define + // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and + // k = int32_t(lower 51 bits of double(x * L2E + C) >> 19). + + // Rounding errors <= 2^-31 + 2^-41. + double tmp = fputil::multiply_add(x, LOG2_E, 0x1.8000'0000'4p21); + int k = static_cast(cpp::bit_cast(tmp) >> 19); + double kd = static_cast(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + int hi = k >> 12; + + bool denorm = (hi <= -1022); + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |x - (hi + mid1 + mid2) * log(2) - dx| < 2^11 * eps(M_LOG_2_EXP2_M12.lo) + // = 2^11 * 2^-13 * 2^-52 + // = 2^-54. + // |dx| < 2^-13 + 2^-30. + double lo_h = fputil::multiply_add(kd, MLOG_2_EXP2_M12_HI, x); // exact + double dx = fputil::multiply_add(kd, MLOG_2_EXP2_M12_MID, lo_h); + + // We use the degree-4 Taylor polynomial to approximate exp(lo): + // exp(lo) ~ 1 + lo + lo^2 / 2 + lo^3 / 6 + lo^4 / 24 = 1 + lo * P(lo) + // So that the errors are bounded by: + // |P(lo) - expm1(lo)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58 + // Let P_ be an evaluation of P where all intermediate computations are in + // double precision. 
+
+  DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
+  DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
+
+  DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
+
+  // |x - (hi + mid1 + mid2) * log(2) - dx| < 2^11 * eps(M_LOG_2_EXP2_M12.lo)
+  //                                        = 2^11 * 2^-13 * 2^-52
+  //                                        = 2^-54.
+  // |dx| < 2^-13 + 2^-30.
+  double lo_h = fputil::multiply_add(kd, MLOG_2_EXP2_M12_HI, x); // exact
+  double dx = fputil::multiply_add(kd, MLOG_2_EXP2_M12_MID, lo_h);
+
+  // We use the degree-4 Taylor polynomial to approximate exp(lo):
+  //   exp(lo) ~ 1 + lo + lo^2 / 2 + lo^3 / 6 + lo^4 / 24 = 1 + lo * P(lo)
+  // So that the errors are bounded by:
+  //   |P(lo) - expm1(lo)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58
+  // Let P_ be an evaluation of P where all intermediate computations are in
+  // double precision. Using either Horner's or Estrin's schemes, the evaluated
+  // errors can be bounded by:
+  //   |P_(dx) - P(dx)| < 2^-51
+  //   => |dx * P_(dx) - expm1(lo)| < 1.5 * 2^-64
+  //   => 2^(mid1 + mid2) * |dx * P_(dx) - expm1(lo)| < 1.5 * 2^-63.
+  // Since we approximate
+  //   2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo,
+  // we use the expression:
+  //   (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~
+  //   ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo)
+  // with errors bounded by 1.5 * 2^-63.
+
+  double mid_lo = dx * exp_mid.hi;
+
+  // Approximate expm1(dx)/dx ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24.
+  double p = poly_approx_d(dx);
+
+  double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo);
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  if (LIBC_UNLIKELY(denorm)) {
+    return ziv_test_denorm</*SKIP_ZIV_TEST=*/true>(hi, exp_mid.hi, lo, ERR_D)
+        .value();
+  } else {
+    // To multiply by 2^hi, a fast way is to simply add hi to the exponent
+    // field.
+    int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+    double r =
+        cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo));
+    return r;
+  }
+#else
+  if (LIBC_UNLIKELY(denorm)) {
+    if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D);
+        LIBC_LIKELY(r.has_value()))
+      return r.value();
+  } else {
+    double upper = exp_mid.hi + (lo + ERR_D);
+    double lower = exp_mid.hi + (lo - ERR_D);
+
+    if (LIBC_LIKELY(upper == lower)) {
+      // To multiply by 2^hi, a fast way is to simply add hi to the exponent
+      // field.
+      int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+      double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper));
+      return r;
+    }
+  }
+
+  // Use double-double
+  DoubleDouble r_dd = exp_double_double(x, kd, exp_mid);
+
+  if (LIBC_UNLIKELY(denorm)) {
+    if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD);
+        LIBC_LIKELY(r.has_value()))
+      return r.value();
+  } else {
+    double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD);
+    double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD);
+
+    if (LIBC_LIKELY(upper_dd == lower_dd)) {
+      int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+      double r =
+          cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper_dd));
+      return r;
+    }
+  }
+
+  // Use 128-bit precision
+  Float128 r_f128 = exp_f128(x, kd, idx1, idx2);
+
+  return static_cast<double>(r_f128);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP_H
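The fast path / accurate pass structure above is an instance of Ziv's rounding test. As a standalone sketch (std::optional standing in for the internal cpp::optional; `err` must bound the total evaluation error):

```cpp
#include <optional>

// If rounding hi + (lo +/- err) gives the same double both ways, that double
// is the correctly rounded result and the fast path may return it; otherwise
// escalate to a higher-precision pass (double-double, then 128-bit above).
inline std::optional<double> ziv_round(double hi, double lo, double err) {
  double upper = hi + (lo + err);
  double lower = hi + (lo - err);
  if (upper == lower)
    return upper;
  return std::nullopt; // too close to a rounding boundary
}
```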
diff --git a/libc/src/__support/math/exp10.h b/libc/src/__support/math/exp10.h
new file mode 100644
index 0000000000000..88748523deb3d
--- /dev/null
+++ b/libc/src/__support/math/exp10.h
@@ -0,0 +1,501 @@
+//===-- Implementation header for exp10 ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H
+
+#include "exp_constants.h" // Lookup tables EXP2_MID1 and EXP2_MID2.
+#include "exp_utils.h"     // ziv_test_denorm.
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/dyadic_float.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/FPUtil/triple_double.h"
+#include "src/__support/common.h"
+#include "src/__support/integer_literals.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+using fputil::DoubleDouble;
+using fputil::TripleDouble;
+using Float128 = typename fputil::DyadicFloat<128>;
+
+using LIBC_NAMESPACE::operator""_u128;
+
+// log2(10)
+static constexpr double LOG2_10 = 0x1.a934f0979a371p+1;
+
+// -2^-12 * log10(2)
+// > a = -2^-12 * log10(2);
+// > b = round(a, 32, RN);
+// > c = round(a - b, 32, RN);
+// > d = round(a - b - c, D, RN);
+// Errors < 1.5 * 2^-144
+static constexpr double MLOG10_2_EXP2_M12_HI = -0x1.3441350ap-14;
+static constexpr double MLOG10_2_EXP2_M12_MID = 0x1.0c0219dc1da99p-51;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+static constexpr double MLOG10_2_EXP2_M12_MID_32 = 0x1.0c0219dcp-51;
+static constexpr double MLOG10_2_EXP2_M12_LO = 0x1.da994fd20dba2p-87;
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+// Error bounds:
+// Errors when using double precision.
+constexpr double ERR_D = 0x1.8p-63;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+// Errors when using double-double precision.
+static constexpr double ERR_DD = 0x1.8p-99;
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+// Polynomial approximations with double precision. Generated by Sollya with:
+// > P = fpminimax((10^x - 1)/x, 3, [|D...|], [-2^-14, 2^-14]);
+// > P;
+// Error bounds:
+//   | output - (10^dx - 1) / dx | < 2^-52.
+LIBC_INLINE static double exp10_poly_approx_d(double dx) {
+  // dx^2
+  double dx2 = dx * dx;
+  double c0 =
+      fputil::multiply_add(dx, 0x1.53524c73cea6ap+1, 0x1.26bb1bbb55516p+1);
+  double c1 =
+      fputil::multiply_add(dx, 0x1.2bd75cc6afc65p+0, 0x1.0470587aa264cp+1);
+  double p = fputil::multiply_add(dx2, c1, c0);
+  return p;
+}
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+// Polynomial approximation with double-double precision. Generated by Sollya
+// with:
+// > P = fpminimax((10^x - 1)/x, 5, [|DD...|], [-2^-14, 2^-14]);
+// Error bounds:
+//   | output - 10^(dx) | < 2^-101
+static constexpr DoubleDouble exp10_poly_approx_dd(const DoubleDouble &dx) {
+  // Taylor polynomial.
+  constexpr DoubleDouble COEFFS[] = {
+      {0, 0x1p0},
+      {-0x1.f48ad494e927bp-53, 0x1.26bb1bbb55516p1},
+      {-0x1.e2bfab3191cd2p-53, 0x1.53524c73cea69p1},
+      {0x1.80fb65ec3b503p-53, 0x1.0470591de2ca4p1},
+      {0x1.338fc05e21e55p-54, 0x1.2bd7609fd98c4p0},
+      {0x1.d4ea116818fbp-56, 0x1.1429ffd519865p-1},
+      {-0x1.872a8ff352077p-57, 0x1.a7ed70847c8b3p-3},
+  };
+
+  DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2],
+                                    COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]);
+  return p;
+}
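The DoubleDouble operations this file relies on (fputil::exact_add, fputil::quick_mult, fputil::polyeval over pairs) are LLVM-libc internals; the underlying building block is the textbook Fast2Sum error-free transformation, sketched here for reference:

```cpp
struct DD {
  double hi, lo;
};

// Dekker's Fast2Sum: assuming |a| >= |b|, s + e equals a + b exactly; e
// captures the rounding error of the addition. Chaining such steps is what
// gives the double-double arithmetic used in the accurate pass.
inline DD fast_two_sum(double a, double b) {
  double s = a + b;
  double e = b - (s - a);
  return {s, e};
}
```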
+
+// Polynomial approximation with 128-bit precision:
+// Return 10^dx ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7
+// For |dx| < 2^-14:
+//   | output - 10^dx | < 1.5 * 2^-124.
+static constexpr Float128 exp10_poly_approx_f128(const Float128 &dx) {
+  constexpr Float128 COEFFS_128[]{
+      {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0
+      {Sign::POS, -126, 0x935d8ddd'aaa8ac16'ea56d62b'82d30a2d_u128},
+      {Sign::POS, -126, 0xa9a92639'e753443a'80a99ce7'5f4d5bdb_u128},
+      {Sign::POS, -126, 0x82382c8e'f1652304'6a4f9d7d'bf6c9635_u128},
+      {Sign::POS, -124, 0x12bd7609'fd98c44c'34578701'9216c7af_u128},
+      {Sign::POS, -127, 0x450a7ff4'7535d889'cc41ed7e'0d27aee5_u128},
+      {Sign::POS, -130, 0xd3f6b844'702d636b'8326bb91'a6e7601d_u128},
+      {Sign::POS, -130, 0x45b937f0'd05bb1cd'fa7b46df'314112a9_u128},
+  };
+
+  Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2],
+                                COEFFS_128[3], COEFFS_128[4], COEFFS_128[5],
+                                COEFFS_128[6], COEFFS_128[7]);
+  return p;
+}
+
+// Compute 10^(x) using 128-bit precision.
+// TODO(lntue): investigate triple-double precision implementation for this
+// step.
+static Float128 exp10_f128(double x, double kd, int idx1, int idx2) {
+  double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact
+  double t2 = kd * MLOG10_2_EXP2_M12_MID_32;                     // exact
+  double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-144
+
+  Float128 dx = fputil::quick_add(
+      Float128(t1), fputil::quick_add(Float128(t2), Float128(t3)));
+
+  // TODO: Skip recalculating exp_mid1 and exp_mid2.
+  Float128 exp_mid1 =
+      fputil::quick_add(Float128(EXP2_MID1[idx1].hi),
+                        fputil::quick_add(Float128(EXP2_MID1[idx1].mid),
+                                          Float128(EXP2_MID1[idx1].lo)));
+
+  Float128 exp_mid2 =
+      fputil::quick_add(Float128(EXP2_MID2[idx2].hi),
+                        fputil::quick_add(Float128(EXP2_MID2[idx2].mid),
+                                          Float128(EXP2_MID2[idx2].lo)));
+
+  Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2);
+
+  Float128 p = exp10_poly_approx_f128(dx);
+
+  Float128 r = fputil::quick_mul(exp_mid, p);
+
+  r.exponent += static_cast<int>(kd) >> 12;
+
+  return r;
+}
+
+// Compute 10^x with double-double precision.
+static DoubleDouble exp10_double_double(double x, double kd,
+                                        const DoubleDouble &exp_mid) {
+  // Recalculate dx:
+  //   dx = x - k * 2^-12 * log10(2)
+  double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact
+  double t2 = kd * MLOG10_2_EXP2_M12_MID_32;                     // exact
+  double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-140
+
+  DoubleDouble dx = fputil::exact_add(t1, t2);
+  dx.lo += t3;
+
+  // Degree-6 polynomial approximation in double-double precision.
+  //   | p - 10^dx | < 2^-103.
+  DoubleDouble p = exp10_poly_approx_dd(dx);
+
+  // Error bounds: 2^-102.
+  DoubleDouble r = fputil::quick_mult(exp_mid, p);
+
+  return r;
+}
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
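Each COEFFS_128 entry above is a dyadic float: a sign, an exponent, and a 128-bit integer mantissa, with value mantissa * 2^exponent. DyadicFloat itself is an LLVM-libc internal; a 64-bit analogue of the decoding, for illustration only:

```cpp
#include <cmath>
#include <cstdint>

// value = (-1)^negative * mantissa * 2^exponent
static double dyadic_to_double(bool negative, int exponent, uint64_t mantissa) {
  double v = std::ldexp(static_cast<double>(mantissa), exponent);
  return negative ? -v : v;
}
// e.g. a 64-bit entry {POS, -63, 1ull << 63} decodes to 2^63 * 2^-63 = 1.0;
// the leading 128-bit coefficient above is the same pattern with exponent
// -127 and mantissa 2^127.
```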
+
+// When the output is denormal.
+static double exp10_denorm(double x) {
+  // Range reduction.
+  double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21);
+  int k = static_cast<int>(cpp::bit_cast<uint64_t>(tmp) >> 19);
+  double kd = static_cast<double>(k);
+
+  uint32_t idx1 = (k >> 6) & 0x3f;
+  uint32_t idx2 = k & 0x3f;
+
+  int hi = k >> 12;
+
+  DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
+  DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
+  DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
+
+  // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14
+  double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact
+  double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h);
+
+  double mid_lo = dx * exp_mid.hi;
+
+  // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
+  double p = exp10_poly_approx_d(dx);
+
+  double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo);
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  return ziv_test_denorm</*SKIP_ZIV_TEST=*/true>(hi, exp_mid.hi, lo, ERR_D)
+      .value();
+#else
+  if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D);
+      LIBC_LIKELY(r.has_value()))
+    return r.value();
+
+  // Use double-double
+  DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid);
+
+  if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD);
+      LIBC_LIKELY(r.has_value()))
+    return r.value();
+
+  // Use 128-bit precision
+  Float128 r_f128 = exp10_f128(x, kd, idx1, idx2);
+
+  return static_cast<double>(r_f128);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+}
+
+// Check for exceptional cases when:
+//  * log10(1 - 2^-54) < x < log10(1 + 2^-53)
+//  * x >= log10(2^1024)
+//  * x <= log10(2^-1022)
+//  * x is inf or nan
+static constexpr double exp10_set_exceptional(double x) {
+  using FPBits = typename fputil::FPBits<double>;
+  FPBits xbits(x);
+
+  uint64_t x_u = xbits.uintval();
+  uint64_t x_abs = xbits.abs().uintval();
+
+  // |x| < log10(1 + 2^-53)
+  if (x_abs <= 0x3c8bcb7b1526e50e) {
+    // 10^(x) ~ 1 + x/2
+    return fputil::multiply_add(x, 0.5, 1.0);
+  }
+
+  // x <= log10(2^-1022) or -inf/nan.
+  if (x_u >= 0xc0733a7146f72a42) {
+    // x <= log10(2^-1075) or -inf/nan
+    if (x_u > 0xc07439b746e36b52) {
+      // 10^(-inf) = 0
+      if (xbits.is_inf())
+        return 0.0;
+
+      // 10^(nan) = nan
+      if (xbits.is_nan())
+        return x;
+
+      if (fputil::quick_get_round() == FE_UPWARD)
+        return FPBits::min_subnormal().get_val();
+      fputil::set_errno_if_required(ERANGE);
+      fputil::raise_except_if_required(FE_UNDERFLOW);
+      return 0.0;
+    }
+
+    return exp10_denorm(x);
+  }
+
+  // x >= log10(2^1024) or +inf/nan
+  // x is finite
+  if (x_u < 0x7ff0'0000'0000'0000ULL) {
+    int rounding = fputil::quick_get_round();
+    if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
+      return FPBits::max_normal().get_val();
+
+    fputil::set_errno_if_required(ERANGE);
+    fputil::raise_except_if_required(FE_OVERFLOW);
+  }
+  // x is +inf or nan
+  return x + FPBits::inf().get_val();
+}
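A sketch of the underflow contract implemented by exp10_set_exceptional: far below log10(2^-1075), a correctly rounded exp10 returns 0 with ERANGE and FE_UNDERFLOW, except under FE_UPWARD, where the smallest subnormal is returned instead. std::pow is used here only as a stand-in caller, and the printed value assumes the host libm is correctly rounded:

```cpp
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  std::fesetround(FE_UPWARD);
  double r = std::pow(10.0, -400.0); // stand-in for exp10(-400.0)
  std::printf("%a\n", r);            // 0x1p-1074 from a correctly rounded libm
  std::fesetround(FE_TONEAREST);
  return 0;
}
```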
+
+namespace math {
+
+static constexpr double exp10(double x) {
+  using FPBits = typename fputil::FPBits<double>;
+  FPBits xbits(x);
+
+  uint64_t x_u = xbits.uintval();
+
+  // x <= log10(2^-1022) or x >= log10(2^1024) or
+  // log10(1 - 2^-54) < x < log10(1 + 2^-53).
+  if (LIBC_UNLIKELY(x_u >= 0xc0733a7146f72a42 ||
+                    (x_u <= 0xbc7bcb7b1526e50e && x_u >= 0x40734413509f79ff) ||
+                    x_u < 0x3c8bcb7b1526e50e)) {
+    return exp10_set_exceptional(x);
+  }
+
+  // Now log10(2^-1075) < x <= log10(1 - 2^-54) or
+  //     log10(1 + 2^-53) < x < log10(2^1024)
+
+  // Range reduction:
+  //   Let x = log10(2) * (hi + mid1 + mid2) + lo
+  //   in which:
+  //     hi is an integer
+  //     mid1 * 2^6 is an integer
+  //     mid2 * 2^12 is an integer
+  //   then:
+  //     10^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 10^(lo).
+  // With this formula:
+  //   - multiplying by 2^hi is exact and cheap, simply by adding the exponent
+  //     field.
+  //   - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables.
+  //   - 10^(lo) ~ 1 + a0*lo + a1 * lo^2 + ...
+  //
+  // We compute (hi + mid1 + mid2) together by performing the rounding on
+  //   x * log2(10) * 2^12.
+  // Since |x| < |log10(2^-1075)| < 2^9,
+  //   |x * 2^12| < 2^9 * 2^12 = 2^21,
+  // so we can fit the rounded result round(x * 2^12) in int32_t.
+  // Thus, the goal is to be able to use an additional addition and a fixed
+  // width shift to get an int32_t representing round(x * 2^12).
+  //
+  // Assuming int32_t uses a 2's-complement representation, since the mantissa
+  // part of a double precision number is unsigned with the leading bit hidden,
+  // if we add an extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 23 to the
+  // product, the bits that are < 2^e2 in the resulting mantissa of
+  // (x*2^12 + C) can be considered as a proper 2's-complement representation
+  // of x*2^12.
+  //
+  // One small problem with this approach is that the sum (x*2^12 + C) in
+  // double precision is rounded to the least significant bit of the dominant
+  // factor C. In order to minimize the rounding errors from this addition, we
+  // want to minimize e1. Another constraint that we want is that after
+  // shifting the mantissa so that the least significant bit of int32_t
+  // corresponds to the unit bit of (x*2^12), the sign is correct without
+  // any adjustment. Combining these 2 requirements, we can choose
+  // C = 2^33 + 2^32, so that the sign bit corresponds to the 2^31 bit, and
+  // hence after right shifting the mantissa, the resulting int32_t has the
+  // correct sign.
+  // With this choice of C, the number of mantissa bits we need to shift to
+  // the right is: 52 - 33 = 19.
+  //
+  // Moreover, since the integer right shifts are equivalent to rounding down,
+  // we can add an extra 0.5 so that it will become round-to-nearest,
+  // ties-to-+infinity. So in particular, we can compute:
+  //   hmm = x * 2^12 + C,
+  // where C = 2^33 + 2^32 + 2^-1, then if
+  //   k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19),
+  // the reduced argument:
+  //   lo = x - log10(2) * 2^-12 * k is bounded by:
+  //   |lo| = |x - log10(2) * 2^-12 * k|
+  //        = log10(2) * 2^-12 * |x * log2(10) * 2^12 - k|
+  //       <= log10(2) * 2^-12 * (2^-1 + 2^-19)
+  //        < 1.5 * 2^-2 * (2^-13 + 2^-31)
+  //        = 1.5 * (2^-15 + 2^-33).
+  //
+  // Finally, notice that k only uses the mantissa of x * 2^12, so the
+  // exponent 2^12 is not needed. So we can simply define
+  //   C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and
+  //   k = int32_t(lower 51 bits of double(x + C) >> 19).
+
+  // Rounding errors <= 2^-31.
+  double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21);
+  int k = static_cast<int>(cpp::bit_cast<uint64_t>(tmp) >> 19);
+  double kd = static_cast<double>(k);
+
+  uint32_t idx1 = (k >> 6) & 0x3f;
+  uint32_t idx2 = k & 0x3f;
+
+  int hi = k >> 12;
+
+  DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
+  DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
+  DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
+
+  // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14
+  double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact
+  double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h);
+
+  // We use the degree-4 polynomial to approximate 10^(lo):
+  //   10^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4
+  //           = 1 + lo * P(lo)
+  // So that the errors are bounded by:
+  //   |P(lo) - (10^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58
+  // Let P_ be an evaluation of P where all intermediate computations are in
+  // double precision. Using either Horner's or Estrin's schemes, the evaluated
+  // errors can be bounded by:
+  //   |P_(lo) - P(lo)| < 2^-51
+  //   => |lo * P_(lo) - (10^lo - 1)| < 2^-65
+  //   => 2^(mid1 + mid2) * |lo * P_(lo) - (10^lo - 1)| < 2^-64.
+  // Since we approximate
+  //   2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo,
+  // we use the expression:
+  //   (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~
+  //   ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo)
+  // with errors bounded by 2^-64.
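How the reduced exponent k feeds the two lookup tables can be restated in plain double arithmetic. In this sketch std::exp2 stands in for the triple-double tables EXP2_MID1/EXP2_MID2, so it is illustrative only, not a faithful substitute:

```cpp
#include <cmath>

// k = hi * 2^12 + idx1 * 2^6 + idx2, so
// 2^(k * 2^-12) = 2^hi * 2^(idx1 / 64) * 2^(idx2 / 4096).
static double two_to_k_over_4096(int k) {
  int hi = k >> 12;           // integer exponent (arithmetic shift = floor)
  int idx1 = (k >> 6) & 0x3f; // index into the 2^(i/64) table
  int idx2 = k & 0x3f;        // index into the 2^(i/4096) table
  return std::ldexp(std::exp2(idx1 / 64.0) * std::exp2(idx2 / 4096.0), hi);
}
```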
+
+  double mid_lo = dx * exp_mid.hi;
+
+  // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
+  double p = exp10_poly_approx_d(dx);
+
+  double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo);
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+  double r =
+      cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo));
+  return r;
+#else
+  double upper = exp_mid.hi + (lo + ERR_D);
+  double lower = exp_mid.hi + (lo - ERR_D);
+
+  if (LIBC_LIKELY(upper == lower)) {
+    // To multiply by 2^hi, a fast way is to simply add hi to the exponent
+    // field.
+    int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+    double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper));
+    return r;
+  }
+
+  // Exact outputs when x = 1, 2, ..., 22, and a hard-to-round case at x = 23.
+  // Quick check mask:
+  //   0x8000'ffff'ffff'ffffULL = ~(bits of 1.0 | ... | bits of 23.0)
+  if (LIBC_UNLIKELY((x_u & 0x8000'ffff'ffff'ffffULL) == 0ULL)) {
+    switch (x_u) {
+    case 0x3ff0000000000000: // x = 1.0
+      return 10.0;
+    case 0x4000000000000000: // x = 2.0
+      return 100.0;
+    case 0x4008000000000000: // x = 3.0
+      return 1'000.0;
+    case 0x4010000000000000: // x = 4.0
+      return 10'000.0;
+    case 0x4014000000000000: // x = 5.0
+      return 100'000.0;
+    case 0x4018000000000000: // x = 6.0
+      return 1'000'000.0;
+    case 0x401c000000000000: // x = 7.0
+      return 10'000'000.0;
+    case 0x4020000000000000: // x = 8.0
+      return 100'000'000.0;
+    case 0x4022000000000000: // x = 9.0
+      return 1'000'000'000.0;
+    case 0x4024000000000000: // x = 10.0
+      return 10'000'000'000.0;
+    case 0x4026000000000000: // x = 11.0
+      return 100'000'000'000.0;
+    case 0x4028000000000000: // x = 12.0
+      return 1'000'000'000'000.0;
+    case 0x402a000000000000: // x = 13.0
+      return 10'000'000'000'000.0;
+    case 0x402c000000000000: // x = 14.0
+      return 100'000'000'000'000.0;
+    case 0x402e000000000000: // x = 15.0
+      return 1'000'000'000'000'000.0;
+    case 0x4030000000000000: // x = 16.0
+      return 10'000'000'000'000'000.0;
+    case 0x4031000000000000: // x = 17.0
+      return 100'000'000'000'000'000.0;
+    case 0x4032000000000000: // x = 18.0
+      return 1'000'000'000'000'000'000.0;
+    case 0x4033000000000000: // x = 19.0
+      return 10'000'000'000'000'000'000.0;
+    case 0x4034000000000000: // x = 20.0
+      return 100'000'000'000'000'000'000.0;
+    case 0x4035000000000000: // x = 21.0
+      return 1'000'000'000'000'000'000'000.0;
+    case 0x4036000000000000: // x = 22.0
+      return 10'000'000'000'000'000'000'000.0;
+    case 0x4037000000000000: // x = 23.0
+      return 0x1.52d02c7e14af6p76 + x;
+    }
+  }
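The exact-powers shortcut works because 10^n = 2^n * 5^n and 5^22 still fits in a double's 53-bit significand, so every power of ten up to 10^22 is exactly representable; 10^23 is the first one that must round, which is why that case returns a pre-split high part plus x so the final rounding happens in the addition. Two compile-time checks of the representability claim:

```cpp
static_assert(2384185791015625ULL < (1ULL << 53),
              "5^22 fits in a double's 53-bit significand");
static_assert(1.0e22 == 10000000000000000000000.0, "10^22 is exact");
```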
+
+  // Use double-double
+  DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid);
+
+  double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD);
+  double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD);
+
+  if (LIBC_LIKELY(upper_dd == lower_dd)) {
+    // To multiply by 2^hi, a fast way is to simply add hi to the exponent
+    // field.
+    int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
+    double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper_dd));
+    return r;
+  }
+
+  // Use 128-bit precision
+  Float128 r_f128 = exp10_f128(x, kd, idx1, idx2);
+
+  return static_cast<double>(r_f128);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H
diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/__support/math/exp10f.h
similarity index 91%
rename from libc/src/math/generic/exp10f_impl.h
rename to libc/src/__support/math/exp10f.h
index 975fd01a0a25c..807b4f0d6c109 100644
--- a/libc/src/math/generic/exp10f_impl.h
+++ b/libc/src/__support/math/exp10f.h
@@ -1,4 +1,4 @@
-//===-- Single-precision 10^x function ------------------------------------===//
+//===-- Implementation header for exp10f ------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,22 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H
-#define LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H
 
-#include "explogxf.h"
+#include "exp10f_utils.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
 
 namespace LIBC_NAMESPACE_DECL {
-namespace generic {
+namespace math {
 
-LIBC_INLINE float exp10f(float x) {
+static constexpr float exp10f(float x) {
   using FPBits = typename fputil::FPBits<float>;
   FPBits xbits(x);
 
@@ -132,7 +131,7 @@ LIBC_INLINE float exp10f(float x) {
   return static_cast<float>(multiply_add(p, lo2 * rr.mh, c0 * rr.mh));
 }
 
-} // namespace generic
+} // namespace math
 } // namespace LIBC_NAMESPACE_DECL
 
-#endif // LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H
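After this rename, a caller pulls the implementation from the support header and the math:: namespace instead of generic::. An illustrative (hypothetical) usage, with the include path as introduced by this patch:

```cpp
#include "src/__support/math/exp10f.h"

namespace LIBC_NAMESPACE_DECL {
// Thin wrapper just to show the new spelling of the call site.
float call_exp10f(float x) { return math::exp10f(x); }
} // namespace LIBC_NAMESPACE_DECL
```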
diff --git a/libc/src/__support/math/exp10f_utils.h b/libc/src/__support/math/exp10f_utils.h
new file mode 100644
index 0000000000000..0493e1b993e0c
--- /dev/null
+++ b/libc/src/__support/math/exp10f_utils.h
@@ -0,0 +1,157 @@
+//===-- Common utils for exp10f ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+struct ExpBase {
+  // Base = e
+  static constexpr int MID_BITS = 5;
+  static constexpr int MID_MASK = (1 << MID_BITS) - 1;
+  // log2(e) * 2^5
+  static constexpr double LOG2_B = 0x1.71547652b82fep+0 * (1 << MID_BITS);
+  // High and low parts of -log(2) * 2^(-5)
+  static constexpr double M_LOGB_2_HI = -0x1.62e42fefa0000p-1 / (1 << MID_BITS);
+  static constexpr double M_LOGB_2_LO =
+      -0x1.cf79abc9e3b3ap-40 / (1 << MID_BITS);
+  // Look up table for bit fields of 2^(i/32) for i = 0..31, generated by
+  // Sollya with:
+  // > for i from 0 to 31 do printdouble(round(2^(i/32), D, RN));
+  static constexpr int64_t EXP_2_MID[1 << MID_BITS] = {
+      0x3ff0000000000000, 0x3ff059b0d3158574, 0x3ff0b5586cf9890f,
+      0x3ff11301d0125b51, 0x3ff172b83c7d517b, 0x3ff1d4873168b9aa,
+      0x3ff2387a6e756238, 0x3ff29e9df51fdee1, 0x3ff306fe0a31b715,
+      0x3ff371a7373aa9cb, 0x3ff3dea64c123422, 0x3ff44e086061892d,
+      0x3ff4bfdad5362a27, 0x3ff5342b569d4f82, 0x3ff5ab07dd485429,
+      0x3ff6247eb03a5585, 0x3ff6a09e667f3bcd, 0x3ff71f75e8ec5f74,
+      0x3ff7a11473eb0187, 0x3ff82589994cce13, 0x3ff8ace5422aa0db,
+      0x3ff93737b0cdc5e5, 0x3ff9c49182a3f090, 0x3ffa5503b23e255d,
+      0x3ffae89f995ad3ad, 0x3ffb7f76f2fb5e47, 0x3ffc199bdd85529c,
+      0x3ffcb720dcef9069, 0x3ffd5818dcfba487, 0x3ffdfc97337b9b5f,
+      0x3ffea4afa2a490da, 0x3fff50765b6e4540,
+  };
+
+  // Approximating e^dx with degree-5 minimax polynomial generated by Sollya:
+  // > Q = fpminimax(expm1(x)/x, 4, [|1, D...|], [-log(2)/64, log(2)/64]);
+  // Then:
+  //   e^dx ~ P(dx) = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[3] * dx^5.
+  static constexpr double COEFFS[4] = {
+      0x1.ffffffffe5bc8p-2, 0x1.555555555cd67p-3, 0x1.5555c2a9b48b4p-5,
+      0x1.11112a0e34bdbp-7};
+
+  LIBC_INLINE static double powb_lo(double dx) {
+    using fputil::multiply_add;
+    double dx2 = dx * dx;
+    double c0 = 1.0 + dx;
+    // c1 = COEFFS[0] + COEFFS[1] * dx
+    double c1 = multiply_add(dx, ExpBase::COEFFS[1], ExpBase::COEFFS[0]);
+    // c2 = COEFFS[2] + COEFFS[3] * dx
+    double c2 = multiply_add(dx, ExpBase::COEFFS[3], ExpBase::COEFFS[2]);
+    // r = c0 + c1 * dx^2 + c2 * dx^4
+    //   = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[3] * dx^5
+    return fputil::polyeval(dx2, c0, c1, c2);
+  }
+};
+
+struct Exp10Base : public ExpBase {
+  // log2(10) * 2^5
+  static constexpr double LOG2_B = 0x1.a934f0979a371p1 * (1 << MID_BITS);
+  // High and low parts of -log10(2) * 2^(-5).
+  // Notice that since |x * log2(10)| < 150:
+  //   |k| = |round(x * log2(10) * 2^5)| < 2^8 * 2^5 = 2^13
+  // So when the FMA instructions are not available, in order for the product
+  //   k * M_LOGB_2_HI
+  // to be exact, we only store the high part of log10(2) up to 38 bits
+  // (= 53 - 15) of precision.
+  // It is generated by Sollya with:
+  // > round(log10(2), 44, RN);
+  static constexpr double M_LOGB_2_HI = -0x1.34413509f8p-2 / (1 << MID_BITS);
+  // > round(log10(2) - 0x1.34413509f8p-2, D, RN);
+  static constexpr double M_LOGB_2_LO = 0x1.80433b83b532ap-44 / (1 << MID_BITS);
+
+  // Approximating 10^dx with degree-5 minimax polynomial generated by Sollya:
+  // > Q = fpminimax((10^x - 1)/x, 4, [|D...|], [-log10(2)/2^6, log10(2)/2^6]);
+  // Then:
+  //   10^dx ~ P(dx) = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5.
+  static constexpr double COEFFS[5] = {0x1.26bb1bbb55515p1, 0x1.53524c73bd3eap1,
+                                       0x1.0470591dff149p1, 0x1.2bd7c0a9fbc4dp0,
+                                       0x1.1429e74a98f43p-1};
+
+  static double powb_lo(double dx) {
+    using fputil::multiply_add;
+    double dx2 = dx * dx;
+    // c0 = 1 + COEFFS[0] * dx
+    double c0 = multiply_add(dx, Exp10Base::COEFFS[0], 1.0);
+    // c1 = COEFFS[1] + COEFFS[2] * dx
+    double c1 = multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]);
+    // c2 = COEFFS[3] + COEFFS[4] * dx
+    double c2 = multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]);
+    // r = c0 + dx^2 * (c1 + c2 * dx^2)
+    //   = c0 + c1 * dx^2 + c2 * dx^4
+    //   = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5.
+    return fputil::polyeval(dx2, c0, c1, c2);
+  }
+};
+
+// Output of range reduction for exp_b: (2^(mid + hi), lo)
+// where:
+//   b^x = 2^(mid + hi) * b^lo
+struct exp_b_reduc_t {
+  double mh; // 2^(mid + hi)
+  double lo;
+};
+
+// The function correctly calculates b^x value with at least float precision
+// in a limited range.
+// Range reduction:
+//   b^x = 2^(hi + mid) * b^lo
+// where:
+//   x = (hi + mid) * log_b(2) + lo
+//   hi is an integer,
+//   0 <= mid * 2^MID_BITS < 2^MID_BITS is an integer
+//   -2^(-MID_BITS - 1) <= lo * log2(b) <= 2^(-MID_BITS - 1)
+// The base class needs to provide the following constants:
+//   - MID_BITS    : number of bits after the decimal point used for mid
+//   - MID_MASK    : 2^MID_BITS - 1, mask to extract mid bits
+//   - LOG2_B      : log2(b) * 2^MID_BITS for scaling
+//   - M_LOGB_2_HI : high part of -log_b(2) * 2^(-MID_BITS)
+//   - M_LOGB_2_LO : low part of -log_b(2) * 2^(-MID_BITS)
+//   - EXP_2_MID   : lookup table for bit fields of 2^mid
+// Return:
+//   { 2^(hi + mid), lo }
+template <class Base>
+LIBC_INLINE static constexpr exp_b_reduc_t exp_b_range_reduc(float x) {
+  double xd = static_cast<double>(x);
+  // kd = round(x * log2(b) * 2^MID_BITS) = (hi + mid) * 2^MID_BITS
+  double kd = fputil::nearest_integer(Base::LOG2_B * xd);
+  // k = int(kd)
+  int k = static_cast<int>(kd);
+  // hi = floor(kd * 2^(-MID_BITS))
+  // exp_hi = shift hi to the exponent field of double precision.
+  uint64_t exp_hi = static_cast<uint64_t>(k >> Base::MID_BITS)
+                    << fputil::FPBits<double>::FRACTION_LEN;
+  // mh = 2^hi * 2^mid
+  // mh_bits = bit field of mh
+  uint64_t mh_bits = Base::EXP_2_MID[k & Base::MID_MASK] + exp_hi;
+  double mh = fputil::FPBits<double>(mh_bits).get_val();
+  // dx = lo = x - (hi + mid) * log_b(2)
+  double dx = fputil::multiply_add(
+      kd, Base::M_LOGB_2_LO, fputil::multiply_add(kd, Base::M_LOGB_2_HI, xd));
+  return {mh, dx};
+}
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
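A sketch of how a base-b exponential consumes exp_b_range_reduc, using the names from the header above (polynomial degree, exceptional-case dispatch, and error handling are all elided, so this is a shape, not a drop-in implementation):

```cpp
// b^x = 2^(hi + mid) * b^lo = rr.mh * powb_lo(rr.lo), where rr.lo is small
// enough that the minimax polynomial in the Base struct applies.
float exp10f_core(float x) {
  auto rr = LIBC_NAMESPACE::exp_b_range_reduc<LIBC_NAMESPACE::Exp10Base>(x);
  double p = LIBC_NAMESPACE::Exp10Base::powb_lo(rr.lo);
  return static_cast<float>(rr.mh * p);
}
```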
diff --git a/libc/src/__support/math/exp_constants.h b/libc/src/__support/math/exp_constants.h
new file mode 100644
index 0000000000000..1abb4479e0848
--- /dev/null
+++ b/libc/src/__support/math/exp_constants.h
@@ -0,0 +1,174 @@
+//===-- Constants for exp function ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP_CONSTANTS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP_CONSTANTS_H + +#include "src/__support/FPUtil/triple_double.h" + +namespace LIBC_NAMESPACE_DECL { + +// Lookup table for 2^(k * 2^-6) with k = 0..63. +// Generated by Sollya with: +// > display=hexadecimal; +// > prec = 500; +// > for i from 0 to 63 do { +// a = 2^(i * 2^-6); +// b = round(a, D, RN); +// c = round(a - b, D, RN); +// d = round(a - b - c, D, RN); +// print("{", d, ",", c, ",", b, "},"); +// }; +static constexpr fputil::TripleDouble EXP2_MID1[64] = { + {0, 0, 0x1p0}, + {-0x1.9085b0a3d74d5p-110, -0x1.19083535b085dp-56, 0x1.02c9a3e778061p0}, + {0x1.05ff94f8d257ep-110, 0x1.d73e2a475b465p-55, 0x1.059b0d3158574p0}, + {0x1.15820d96b414fp-111, 0x1.186be4bb284ffp-57, 0x1.0874518759bc8p0}, + {-0x1.67c9bd6ebf74cp-108, 0x1.8a62e4adc610bp-54, 0x1.0b5586cf9890fp0}, + {-0x1.5aa76994e9ddbp-113, 0x1.03a1727c57b53p-59, 0x1.0e3ec32d3d1a2p0}, + {0x1.9d58b988f562dp-109, -0x1.6c51039449b3ap-54, 0x1.11301d0125b51p0}, + {-0x1.2fe7bb4c76416p-108, -0x1.32fbf9af1369ep-54, 0x1.1429aaea92dep0}, + {0x1.4f2406aa13ffp-109, -0x1.19041b9d78a76p-55, 0x1.172b83c7d517bp0}, + {0x1.ad36183926ae8p-111, 0x1.e5b4c7b4968e4p-55, 0x1.1a35beb6fcb75p0}, + {0x1.ea62d0881b918p-110, 0x1.e016e00a2643cp-54, 0x1.1d4873168b9aap0}, + {-0x1.781dbc16f1ea4p-111, 0x1.dc775814a8495p-55, 0x1.2063b88628cd6p0}, + {-0x1.4d89f9af532ep-109, 0x1.9b07eb6c70573p-54, 0x1.2387a6e756238p0}, + {0x1.277393a461b77p-110, 0x1.2bd339940e9d9p-55, 0x1.26b4565e27cddp0}, + {0x1.de5448560469p-111, 0x1.612e8afad1255p-55, 0x1.29e9df51fdee1p0}, + {-0x1.ee9d8f8cb9307p-110, 0x1.0024754db41d5p-54, 0x1.2d285a6e4030bp0}, + {0x1.7b7b2f09cd0d9p-110, 0x1.6f46ad23182e4p-55, 0x1.306fe0a31b715p0}, + {-0x1.406a2ea6cfc6bp-108, 0x1.32721843659a6p-54, 0x1.33c08b26416ffp0}, + {0x1.87e3e12516bfap-108, -0x1.63aeabf42eae2p-54, 0x1.371a7373aa9cbp0}, + {0x1.9b0b1ff17c296p-111, -0x1.5e436d661f5e3p-56, 0x1.3a7db34e59ff7p0}, + {-0x1.808ba68fa8fb7p-109, 0x1.ada0911f09ebcp-55, 0x1.3dea64c123422p0}, + {-0x1.32b43eafc6518p-114, -0x1.ef3691c309278p-58, 0x1.4160a21f72e2ap0}, + {-0x1.0ac312de3d922p-114, 0x1.89b7a04ef80dp-59, 0x1.44e086061892dp0}, + {0x1.e1eebae743acp-111, 0x1.3c1a3b69062fp-56, 0x1.486a2b5c13cdp0}, + {0x1.c06c7745c2b39p-113, 0x1.d4397afec42e2p-56, 0x1.4bfdad5362a27p0}, + {-0x1.1aa1fd7b685cdp-112, -0x1.4b309d25957e3p-54, 0x1.4f9b2769d2ca7p0}, + {0x1.fa733951f214cp-111, -0x1.07abe1db13cadp-55, 0x1.5342b569d4f82p0}, + {-0x1.ff86852a613ffp-111, 0x1.9bb2c011d93adp-54, 0x1.56f4736b527dap0}, + {-0x1.744ee506fdafep-109, 0x1.6324c054647adp-54, 0x1.5ab07dd485429p0}, + {-0x1.95f9ab75fa7d6p-108, 0x1.ba6f93080e65ep-54, 0x1.5e76f15ad2148p0}, + {0x1.5d8e757cfb991p-111, -0x1.383c17e40b497p-54, 0x1.6247eb03a5585p0}, + {0x1.4a337f4dc0a3bp-108, -0x1.bb60987591c34p-54, 0x1.6623882552225p0}, + {0x1.57d3e3adec175p-108, -0x1.bdd3413b26456p-54, 0x1.6a09e667f3bcdp0}, + {0x1.a59f88abbe778p-115, -0x1.bbe3a683c88abp-57, 0x1.6dfb23c651a2fp0}, + {-0x1.269796953a4c3p-109, -0x1.16e4786887a99p-55, 0x1.71f75e8ec5f74p0}, + {-0x1.8f8e7fa19e5e8p-108, -0x1.0245957316dd3p-54, 0x1.75feb564267c9p0}, + {-0x1.4217a932d10d4p-113, -0x1.41577ee04992fp-55, 0x1.7a11473eb0187p0}, + {0x1.70a1427f8fcdfp-112, 0x1.05d02ba15797ep-56, 0x1.7e2f336cf4e62p0}, + {0x1.0f6ad65cbbac1p-112, -0x1.d4c1dd41532d8p-54, 0x1.82589994cce13p0}, 
+ {-0x1.f16f65181d921p-109, -0x1.fc6f89bd4f6bap-54, 0x1.868d99b4492edp0}, + {-0x1.30644a7836333p-110, 0x1.6e9f156864b27p-54, 0x1.8ace5422aa0dbp0}, + {0x1.3bf26d2b85163p-114, 0x1.5cc13a2e3976cp-55, 0x1.8f1ae99157736p0}, + {0x1.697e257ac0db2p-111, -0x1.75fc781b57ebcp-57, 0x1.93737b0cdc5e5p0}, + {0x1.7edb9d7144b6fp-108, -0x1.d185b7c1b85d1p-54, 0x1.97d829fde4e5p0}, + {0x1.6376b7943085cp-110, 0x1.c7c46b071f2bep-56, 0x1.9c49182a3f09p0}, + {0x1.354084551b4fbp-109, -0x1.359495d1cd533p-54, 0x1.a0c667b5de565p0}, + {-0x1.bfd7adfd63f48p-111, -0x1.d2f6edb8d41e1p-54, 0x1.a5503b23e255dp0}, + {0x1.8b16ae39e8cb9p-109, 0x1.0fac90ef7fd31p-54, 0x1.a9e6b5579fdbfp0}, + {0x1.a7fbc3ae675eap-108, 0x1.7a1cd345dcc81p-54, 0x1.ae89f995ad3adp0}, + {0x1.2babc0edda4d9p-111, -0x1.2805e3084d708p-57, 0x1.b33a2b84f15fbp0}, + {0x1.aa64481e1ab72p-111, -0x1.5584f7e54ac3bp-56, 0x1.b7f76f2fb5e47p0}, + {0x1.9a164050e1258p-109, 0x1.23dd07a2d9e84p-55, 0x1.bcc1e904bc1d2p0}, + {0x1.99e51125928dap-110, 0x1.11065895048ddp-55, 0x1.c199bdd85529cp0}, + {-0x1.fc44c329d5cb2p-109, 0x1.2884dff483cadp-54, 0x1.c67f12e57d14bp0}, + {0x1.d8765566b032ep-110, 0x1.503cbd1e949dbp-56, 0x1.cb720dcef9069p0}, + {-0x1.e7044039da0f6p-108, -0x1.cbc3743797a9cp-54, 0x1.d072d4a07897cp0}, + {-0x1.ab053b05531fcp-111, 0x1.2ed02d75b3707p-55, 0x1.d5818dcfba487p0}, + {0x1.7f6246f0ec615p-108, 0x1.c2300696db532p-54, 0x1.da9e603db3285p0}, + {0x1.b7225a944efd6p-108, -0x1.1a5cd4f184b5cp-54, 0x1.dfc97337b9b5fp0}, + {0x1.1e92cb3c2d278p-109, 0x1.39e8980a9cc8fp-55, 0x1.e502ee78b3ff6p0}, + {-0x1.fc0f242bbf3dep-109, -0x1.e9c23179c2893p-54, 0x1.ea4afa2a490dap0}, + {0x1.f6dd5d229ff69p-108, 0x1.dc7f486a4b6bp-54, 0x1.efa1bee615a27p0}, + {-0x1.4019bffc80ef3p-110, 0x1.9d3e12dd8a18bp-54, 0x1.f50765b6e454p0}, + {0x1.dc060c36f7651p-112, 0x1.74853f3a5931ep-55, 0x1.fa7c1819e90d8p0}, +}; + +// Lookup table for 2^(k * 2^-12) with k = 0..63. 
+// Generated by Sollya with: +// > display=hexadecimal; +// > prec = 500; +// > for i from 0 to 63 do { +// a = 2^(i * 2^-12); +// b = round(a, D, RN); +// c = round(a - b, D, RN); +// d = round(a - b - c, D, RN); +// print("{", d, ",", c, ",", b, "},"); +// }; +static constexpr fputil::TripleDouble EXP2_MID2[64] = { + {0, 0, 0x1p0}, + {0x1.39726694630e3p-108, 0x1.ae8e38c59c72ap-54, 0x1.000b175effdc7p0}, + {0x1.e5e06ddd31156p-112, -0x1.7b5d0d58ea8f4p-58, 0x1.00162f3904052p0}, + {0x1.5a0768b51f609p-111, 0x1.4115cb6b16a8ep-54, 0x1.0021478e11ce6p0}, + {0x1.d008403605217p-111, -0x1.d7c96f201bb2fp-55, 0x1.002c605e2e8cfp0}, + {0x1.89bc16f765708p-109, 0x1.84711d4c35e9fp-54, 0x1.003779a95f959p0}, + {-0x1.4535b7f8c1e2dp-109, -0x1.0484245243777p-55, 0x1.0042936faa3d8p0}, + {-0x1.8ba92f6b25456p-108, -0x1.4b237da2025f9p-54, 0x1.004dadb113dap0}, + {-0x1.30c72e81f4294p-113, -0x1.5e00e62d6b30dp-56, 0x1.0058c86da1c0ap0}, + {-0x1.34a5384e6f0b9p-110, 0x1.a1d6cedbb9481p-54, 0x1.0063e3a559473p0}, + {0x1.f8d0580865d2ep-108, -0x1.4acf197a00142p-54, 0x1.006eff583fc3dp0}, + {-0x1.002bcb3ae9a99p-111, -0x1.eaf2ea42391a5p-57, 0x1.007a1b865a8cap0}, + {0x1.c3c5aedee9851p-111, 0x1.da93f90835f75p-56, 0x1.0085382faef83p0}, + {0x1.7217851d1ec6ep-109, -0x1.6a79084ab093cp-55, 0x1.00905554425d4p0}, + {-0x1.80cbca335a7c3p-110, 0x1.86364f8fbe8f8p-54, 0x1.009b72f41a12bp0}, + {-0x1.706bd4eb22595p-110, -0x1.82e8e14e3110ep-55, 0x1.00a6910f3b6fdp0}, + {-0x1.b55dd523f3c08p-111, -0x1.4f6b2a7609f71p-55, 0x1.00b1afa5abcbfp0}, + {0x1.90a1e207cced1p-110, -0x1.e1a258ea8f71bp-56, 0x1.00bcceb7707ecp0}, + {0x1.78d0472db37c5p-110, 0x1.4362ca5bc26f1p-56, 0x1.00c7ee448ee02p0}, + {-0x1.bcd4db3cb52fep-109, 0x1.095a56c919d02p-54, 0x1.00d30e4d0c483p0}, + {-0x1.cf1b131575ec2p-112, -0x1.406ac4e81a645p-57, 0x1.00de2ed0ee0f5p0}, + {-0x1.6aaa1fa7ff913p-112, 0x1.b5a6902767e09p-54, 0x1.00e94fd0398ep0}, + {0x1.68f236dff3218p-110, -0x1.91b2060859321p-54, 0x1.00f4714af41d3p0}, + {-0x1.e8bb58067e60ap-109, 0x1.427068ab22306p-55, 0x1.00ff93412315cp0}, + {0x1.d4cd5e1d71fdfp-108, 0x1.c1d0660524e08p-54, 0x1.010ab5b2cbd11p0}, + {0x1.e4ecf350ebe88p-108, -0x1.e7bdfb3204be8p-54, 0x1.0115d89ff3a8bp0}, + {0x1.6a2aa2c89c4f8p-109, 0x1.843aa8b9cbbc6p-55, 0x1.0120fc089ff63p0}, + {0x1.1ca368a20ed05p-110, -0x1.34104ee7edae9p-56, 0x1.012c1fecd613bp0}, + {0x1.edb1095d925cfp-114, -0x1.2b6aeb6176892p-56, 0x1.0137444c9b5b5p0}, + {-0x1.488c78eded75fp-111, 0x1.a8cd33b8a1bb3p-56, 0x1.01426927f5278p0}, + {-0x1.7480f5ea1b3c9p-113, 0x1.2edc08e5da99ap-56, 0x1.014d8e7ee8d2fp0}, + {-0x1.ae45989a04dd5p-111, 0x1.57ba2dc7e0c73p-55, 0x1.0158b4517bb88p0}, + {0x1.bf48007d80987p-109, 0x1.b61299ab8cdb7p-54, 0x1.0163da9fb3335p0}, + {0x1.1aa91a059292cp-109, -0x1.90565902c5f44p-54, 0x1.016f0169949edp0}, + {0x1.b6663292855f5p-110, 0x1.70fc41c5c2d53p-55, 0x1.017a28af25567p0}, + {0x1.e7fbca6793d94p-108, 0x1.4b9a6e145d76cp-54, 0x1.018550706ab62p0}, + {-0x1.5b9f5c7de3b93p-110, -0x1.008eff5142bf9p-56, 0x1.019078ad6a19fp0}, + {0x1.4638bf2f6acabp-110, -0x1.77669f033c7dep-54, 0x1.019ba16628de2p0}, + {-0x1.ab237b9a069c5p-109, -0x1.09bb78eeead0ap-54, 0x1.01a6ca9aac5f3p0}, + {0x1.3ab358be97cefp-108, 0x1.371231477ece5p-54, 0x1.01b1f44af9f9ep0}, + {-0x1.4027b2294bb64p-110, 0x1.5e7626621eb5bp-56, 0x1.01bd1e77170b4p0}, + {0x1.656394426c99p-111, -0x1.bc72b100828a5p-54, 0x1.01c8491f08f08p0}, + {0x1.bf9785189bdd8p-111, -0x1.ce39cbbab8bbep-57, 0x1.01d37442d507p0}, + {0x1.7c12f86114fe3p-109, 0x1.16996709da2e2p-55, 0x1.01de9fe280ac8p0}, + {-0x1.653d5d24b5d28p-109, -0x1.c11f5239bf535p-55, 0x1.01e9cbfe113efp0}, + 
{0x1.04a0cdc1d86d7p-109, 0x1.e1d4eb5edc6b3p-55, 0x1.01f4f8958c1c6p0},
+    {0x1.c678c46149782p-109, -0x1.afb99946ee3fp-54, 0x1.020025a8f6a35p0},
+    {0x1.48524e1e9df7p-108, -0x1.8f06d8a148a32p-54, 0x1.020b533856324p0},
+    {0x1.9953ea727ff0bp-109, -0x1.2bf310fc54eb6p-55, 0x1.02168143b0281p0},
+    {-0x1.ccfbbec22d28ep-108, -0x1.c95a035eb4175p-54, 0x1.0221afcb09e3ep0},
+    {0x1.9e2bb6e181de1p-108, -0x1.491793e46834dp-54, 0x1.022cdece68c4fp0},
+    {0x1.f17609ae29308p-110, -0x1.3e8d0d9c49091p-56, 0x1.02380e4dd22adp0},
+    {-0x1.c7dc2c476bfb8p-110, -0x1.314aa16278aa3p-54, 0x1.02433e494b755p0},
+    {-0x1.fab994971d4a3p-109, 0x1.48daf888e9651p-55, 0x1.024e6ec0da046p0},
+    {0x1.848b62cbdd0afp-109, 0x1.56dc8046821f4p-55, 0x1.02599fb483385p0},
+    {-0x1.bf603ba715d0cp-109, 0x1.45b42356b9d47p-54, 0x1.0264d1244c719p0},
+    {0x1.89434e751e1aap-110, -0x1.082ef51b61d7ep-56, 0x1.027003103b10ep0},
+    {-0x1.03b54fd64e8acp-110, 0x1.2106ed0920a34p-56, 0x1.027b357854772p0},
+    {0x1.7785ea0acc486p-109, -0x1.fd4cf26ea5d0fp-54, 0x1.0286685c9e059p0},
+    {-0x1.ce447fdb35ff9p-109, -0x1.09f8775e78084p-54, 0x1.02919bbd1d1d8p0},
+    {0x1.5b884aab5642ap-112, 0x1.64cbba902ca27p-58, 0x1.029ccf99d720ap0},
+    {-0x1.cfb3e46d7c1cp-108, 0x1.4383ef231d207p-54, 0x1.02a803f2d170dp0},
+    {-0x1.0d40cee4b81afp-112, 0x1.4a47a505b3a47p-54, 0x1.02b338c811703p0},
+    {0x1.6ae7d36d7c1f7p-109, 0x1.e47120223467fp-54, 0x1.02be6e199c811p0},
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP_CONSTANTS_H
diff --git a/libc/src/__support/math/exp_utils.h b/libc/src/__support/math/exp_utils.h
new file mode 100644
index 0000000000000..fc9ab10d76cc4
--- /dev/null
+++ b/libc/src/__support/math/exp_utils.h
@@ -0,0 +1,72 @@
+//===-- Common utils for exp function ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP_UTILS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP_UTILS_H
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/FPUtil/FPBits.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Rounding tests for 2^hi * (mid + lo) when the output might be denormal. We
+// assume further that 1 <= mid < 2, mid + lo < 2, and |lo| << mid.
+// Notice that, if 0 < x < 2^-1022,
+//   double(2^-1022 + x) - 2^-1022 = double(x).
+// So if we scale x up by 2^1022, we can use
+//   double(1.0 + 2^1022 * x) - 1.0 to test how x is rounded in denormal range.
+template <bool SKIP_ZIV_TEST = false>
+static constexpr cpp::optional<double> ziv_test_denorm(int hi, double mid,
+                                                       double lo, double err) {
+  using FPBits = typename fputil::FPBits<double>;
+
+  // Scaling factor = 1/(min normal number) = 2^1022
+  int64_t exp_hi = static_cast<int64_t>(hi + 1022) << FPBits::FRACTION_LEN;
+  double mid_hi = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(mid));
+  double lo_scaled =
+      (lo != 0.0) ? cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(lo))
+                  : 0.0;
+
+  double extra_factor = 0.0;
+  uint64_t scale_down = 0x3FE0'0000'0000'0000; // 1022 in the exponent field.
+
+  // The result is denormal if (mid_hi + lo_scaled < 1.0).
+  if ((1.0 - mid_hi) > lo_scaled) {
+    // An extra rounding step is needed, which adds more rounding errors.
+    err += 0x1.0p-52;
+    extra_factor = 1.0;
+    scale_down = 0x3FF0'0000'0000'0000; // 1023 in the exponent field.
+  }
+
+  // By adding 1.0, the results will have similar rounding points as denormal
+  // outputs.
+  if constexpr (SKIP_ZIV_TEST) {
+    double r = extra_factor + (mid_hi + lo_scaled);
+    return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(r) - scale_down);
+  } else {
+    double err_scaled =
+        cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(err));
+
+    double lo_u = lo_scaled + err_scaled;
+    double lo_l = lo_scaled - err_scaled;
+
+    double upper = extra_factor + (mid_hi + lo_u);
+    double lower = extra_factor + (mid_hi + lo_l);
+
+    if (LIBC_LIKELY(upper == lower)) {
+      return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(upper) - scale_down);
+    }
+
+    return cpp::nullopt;
+  }
+}
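The exp_hi manipulation in ziv_test_denorm is the same "add to the exponent field" scaling used throughout this patch. In isolation, and valid only while input and output both stay normal:

```cpp
#include <cstdint>
#include <cstring>

// Multiply a normal double by 2^hi by adding hi to the biased exponent field
// (bit 52 upward), mirroring exp_hi + cpp::bit_cast<int64_t>(...) above.
static double mul_pow2(double d, int hi) {
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof(bits));
  bits += static_cast<uint64_t>(static_cast<int64_t>(hi)) << 52;
  double r;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}
// mul_pow2(1.5, 10) == 1536.0; mul_pow2(3.0, -2) == 0.75
```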
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP_UTILS_H
diff --git a/libc/src/__support/math/expf16_utils.h b/libc/src/__support/math/expf16_utils.h
index bebb72b09b886..8a2fc9415ab8e 100644
--- a/libc/src/__support/math/expf16_utils.h
+++ b/libc/src/__support/math/expf16_utils.h
@@ -47,7 +47,7 @@ struct ExpRangeReduction {
   float exp_lo;
 };
 
-static constexpr ExpRangeReduction exp_range_reduction(float16 x) {
+[[maybe_unused]] static ExpRangeReduction exp_range_reduction(float16 x) {
   // For -18 < x < 12, to compute exp(x), we perform the following range
   // reduction: find hi, mid, lo, such that:
   //   x = hi + mid + lo, in which
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index d3fb58ed0c71c..802441d37fe92 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -6,6 +6,19 @@ add_header_library(
     libc.hdr.types.char32_t
 )
 
+add_header_library(
+  string_converter
+  HDRS
+    string_converter.h
+  DEPENDS
+    libc.hdr.types.char8_t
+    libc.hdr.types.char32_t
+    libc.hdr.types.size_t
+    libc.src.__support.error_or
+    .mbstate
+    .character_converter
+)
+
 add_object_library(
   character_converter
   HDRS
@@ -16,6 +29,7 @@ add_object_library(
     libc.hdr.errno_macros
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
+    libc.hdr.types.size_t
     libc.src.__support.error_or
     libc.src.__support.math_extras
     .mbstate
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3cacfa5689e4d..15d0f478a18a9 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -9,6 +9,7 @@
 #include "hdr/errno_macros.h"
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
@@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
     state->bytes_stored++;
     return 0;
   }
+
   // Invalid byte -> reset the state
   clear();
   return EILSEQ;
 }
@@ -130,6 +132,12 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
   return utf32;
 }
 
+size_t CharacterConverter::sizeAsUTF32() {
+  return 1; // a single utf-32 value can fit an entire character
+}
+
+size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }
+
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isEmpty())
     return Error(-1);
@@ -156,6 +164,9 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   }
 
   state->bytes_stored--;
+  if (state->bytes_stored == 0)
+    clear();
+
   return static_cast<char8_t>(output);
 }
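Rough shape of a UTF-8 to UTF-32 decoding loop over the CharacterConverter interface extended above. ErrorOr and mbstate are LLVM-libc internals, error handling is elided, and the internal:: namespace is assumed from the string_converter.h header that follows:

```cpp
#include "src/__support/wchar/character_converter.h"

namespace LIBC_NAMESPACE_DECL {
char32_t decode_one(internal::mbstate *state, const char8_t *utf8,
                    size_t len) {
  internal::CharacterConverter cc(state);
  for (size_t i = 0; i < len && !cc.isFull(); ++i)
    cc.push(utf8[i]); // feed bytes until a full code point is buffered
  auto res = cc.pop_utf32();
  return res.has_value() ? res.value() : U'\0';
}
} // namespace LIBC_NAMESPACE_DECL
```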
"hdr/types/char8_t.h" +#include "hdr/types/size_t.h" #include "src/__support/common.h" #include "src/__support/error_or.h" #include "src/__support/wchar/mbstate.h" @@ -30,6 +31,9 @@ class CharacterConverter { bool isEmpty(); bool isValidState(); + size_t sizeAsUTF32(); + size_t sizeAsUTF8(); + int push(char8_t utf8_byte); int push(char32_t utf32); diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h new file mode 100644 index 0000000000000..869ebdfc8b390 --- /dev/null +++ b/libc/src/__support/wchar/string_converter.h @@ -0,0 +1,116 @@ +//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H +#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H + +#include "hdr/types/char32_t.h" +#include "hdr/types/char8_t.h" +#include "hdr/types/size_t.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/wchar/character_converter.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +template class StringConverter { +private: + CharacterConverter cr; + const T *src; + size_t src_len; + size_t src_idx; + + // # of pops we are allowed to perform (essentially size of the dest buffer) + size_t num_to_write; + + ErrorOr pushFullCharacter() { + size_t num_pushed; + for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len; + ++num_pushed) { + int err = cr.push(src[src_idx + num_pushed]); + if (err != 0) + return Error(err); + } + + // if we aren't able to read a full character from the source string + if (src_idx + num_pushed == src_len && !cr.isFull()) { + src_idx += num_pushed; + return Error(-1); + } + + return num_pushed; + } + +public: + StringConverter(const T *s, mbstate *ps, size_t dstlen, + size_t srclen = SIZE_MAX) + : cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen) {} + + // TODO: following functions are almost identical + // look into templating CharacterConverter pop functions + ErrorOr popUTF32() { + if (num_to_write == 0) + return Error(-1); + + if (cr.isEmpty() || src_idx == 0) { + auto src_elements_read = pushFullCharacter(); + if (!src_elements_read.has_value()) + return Error(src_elements_read.error()); + + if (cr.sizeAsUTF32() > num_to_write) { + cr.clear(); + return Error(-1); + } + + src_idx += src_elements_read.value(); + } + + auto out = cr.pop_utf32(); + if (out.has_value() && out.value() == L'\0') + src_len = src_idx; + + num_to_write--; + + return out; + } + + ErrorOr popUTF8() { + if (num_to_write == 0) + return Error(-1); + + if (cr.isEmpty() || src_idx == 0) { + auto src_elements_read = pushFullCharacter(); + if (!src_elements_read.has_value()) + return Error(src_elements_read.error()); + + if (cr.sizeAsUTF8() > num_to_write) { + cr.clear(); + return Error(-1); + } + + src_idx += src_elements_read.value(); + } + + auto out = cr.pop_utf8(); + if (out.has_value() && out.value() == '\0') + src_len = src_idx; + + num_to_write--; + + return out; + } + + size_t getSourceIndex() { return src_idx; } +}; + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H diff --git 
a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index a9237741788ae..99db743315d43 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -358,7 +358,6 @@ add_entrypoint_object( libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fma - libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization ) @@ -448,7 +447,6 @@ add_entrypoint_object( libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma - libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization @@ -1312,20 +1310,7 @@ add_entrypoint_object( HDRS ../exp.h DEPENDS - .common_constants - .explogxf - libc.src.__support.CPP.bit - libc.src.__support.CPP.optional - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.FPUtil.triple_double - libc.src.__support.integer_literals - libc.src.__support.macros.optimization + libc.src.__support.math.exp libc.src.errno.errno ) @@ -1470,35 +1455,7 @@ add_entrypoint_object( HDRS ../exp10.h DEPENDS - .common_constants - .explogxf - libc.src.__support.CPP.bit - libc.src.__support.CPP.optional - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.FPUtil.triple_double - libc.src.__support.integer_literals - libc.src.__support.macros.optimization - libc.src.errno.errno -) - -add_header_library( - exp10f_impl - HDRS - exp10f_impl.h - DEPENDS - .explogxf - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization - libc.src.__support.common + libc.src.__support.math.exp10 libc.src.errno.errno ) @@ -1509,7 +1466,8 @@ add_entrypoint_object( HDRS ../exp10f.h DEPENDS - .exp10f_impl + libc.src.__support.math.exp10f + libc.src.errno.errno ) add_entrypoint_object( @@ -1646,17 +1604,15 @@ add_entrypoint_object( ../powf.h DEPENDS .common_constants - .exp10f_impl .exp2f_impl .explogxf + libc.src.__support.math.exp10f libc.src.__support.CPP.bit - libc.src.__support.CPP.optional libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode libc.src.__support.FPUtil.sqrt libc.src.__support.FPUtil.triple_double libc.src.__support.macros.optimization @@ -1953,8 +1909,8 @@ add_object_library( SRCS common_constants.cpp DEPENDS + libc.src.__support.math.exp_constants libc.src.__support.number_pair - libc.src.__support.FPUtil.triple_double ) add_header_library( @@ -3810,24 +3766,15 @@ add_entrypoint_object( ) #TODO: Add errno include to the hyperbolic functions. 
-add_object_library( +add_header_library( explogxf HDRS explogxf.h - SRCS - explogxf.cpp DEPENDS .common_constants - libc.src.__support.CPP.bit - libc.src.__support.CPP.optional - libc.src.__support.FPUtil.basic_operations - libc.src.__support.FPUtil.basic_operations - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.common + libc.src.__support.math.exp_utils + libc.src.__support.math.exp10f_utils + libc.src.__support.macros.properties.cpu_features libc.src.errno.errno ) @@ -4010,6 +3957,7 @@ add_entrypoint_object( DEPENDS .explogxf libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.fenv_impl libc.src.__support.macros.optimization ) diff --git a/libc/src/math/generic/atanhf.cpp b/libc/src/math/generic/atanhf.cpp index 2149314d2f676..f6fde766ef785 100644 --- a/libc/src/math/generic/atanhf.cpp +++ b/libc/src/math/generic/atanhf.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/math/atanhf.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/math/generic/common_constants.cpp index b2c1293c6326d..4dcf84d00ad50 100644 --- a/libc/src/math/generic/common_constants.cpp +++ b/libc/src/math/generic/common_constants.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "common_constants.h" -#include "src/__support/FPUtil/triple_double.h" #include "src/__support/macros/config.h" #include "src/__support/number_pair.h" @@ -728,160 +727,4 @@ const double EXP_M2[128] = { 0x1.568bb722dd593p1, 0x1.593b7d72305bbp1, }; -// Lookup table for 2^(k * 2^-6) with k = 0..63. 
-// Generated by Sollya with: -// > display=hexadecimal; -// > prec = 500; -// > for i from 0 to 63 do { -// a = 2^(i * 2^-6); -// b = round(a, D, RN); -// c = round(a - b, D, RN); -// d = round(a - b - c, D, RN); -// print("{", d, ",", c, ",", b, "},"); -// }; -alignas(16) const fputil::TripleDouble EXP2_MID1[64] = { - {0, 0, 0x1p0}, - {-0x1.9085b0a3d74d5p-110, -0x1.19083535b085dp-56, 0x1.02c9a3e778061p0}, - {0x1.05ff94f8d257ep-110, 0x1.d73e2a475b465p-55, 0x1.059b0d3158574p0}, - {0x1.15820d96b414fp-111, 0x1.186be4bb284ffp-57, 0x1.0874518759bc8p0}, - {-0x1.67c9bd6ebf74cp-108, 0x1.8a62e4adc610bp-54, 0x1.0b5586cf9890fp0}, - {-0x1.5aa76994e9ddbp-113, 0x1.03a1727c57b53p-59, 0x1.0e3ec32d3d1a2p0}, - {0x1.9d58b988f562dp-109, -0x1.6c51039449b3ap-54, 0x1.11301d0125b51p0}, - {-0x1.2fe7bb4c76416p-108, -0x1.32fbf9af1369ep-54, 0x1.1429aaea92dep0}, - {0x1.4f2406aa13ffp-109, -0x1.19041b9d78a76p-55, 0x1.172b83c7d517bp0}, - {0x1.ad36183926ae8p-111, 0x1.e5b4c7b4968e4p-55, 0x1.1a35beb6fcb75p0}, - {0x1.ea62d0881b918p-110, 0x1.e016e00a2643cp-54, 0x1.1d4873168b9aap0}, - {-0x1.781dbc16f1ea4p-111, 0x1.dc775814a8495p-55, 0x1.2063b88628cd6p0}, - {-0x1.4d89f9af532ep-109, 0x1.9b07eb6c70573p-54, 0x1.2387a6e756238p0}, - {0x1.277393a461b77p-110, 0x1.2bd339940e9d9p-55, 0x1.26b4565e27cddp0}, - {0x1.de5448560469p-111, 0x1.612e8afad1255p-55, 0x1.29e9df51fdee1p0}, - {-0x1.ee9d8f8cb9307p-110, 0x1.0024754db41d5p-54, 0x1.2d285a6e4030bp0}, - {0x1.7b7b2f09cd0d9p-110, 0x1.6f46ad23182e4p-55, 0x1.306fe0a31b715p0}, - {-0x1.406a2ea6cfc6bp-108, 0x1.32721843659a6p-54, 0x1.33c08b26416ffp0}, - {0x1.87e3e12516bfap-108, -0x1.63aeabf42eae2p-54, 0x1.371a7373aa9cbp0}, - {0x1.9b0b1ff17c296p-111, -0x1.5e436d661f5e3p-56, 0x1.3a7db34e59ff7p0}, - {-0x1.808ba68fa8fb7p-109, 0x1.ada0911f09ebcp-55, 0x1.3dea64c123422p0}, - {-0x1.32b43eafc6518p-114, -0x1.ef3691c309278p-58, 0x1.4160a21f72e2ap0}, - {-0x1.0ac312de3d922p-114, 0x1.89b7a04ef80dp-59, 0x1.44e086061892dp0}, - {0x1.e1eebae743acp-111, 0x1.3c1a3b69062fp-56, 0x1.486a2b5c13cdp0}, - {0x1.c06c7745c2b39p-113, 0x1.d4397afec42e2p-56, 0x1.4bfdad5362a27p0}, - {-0x1.1aa1fd7b685cdp-112, -0x1.4b309d25957e3p-54, 0x1.4f9b2769d2ca7p0}, - {0x1.fa733951f214cp-111, -0x1.07abe1db13cadp-55, 0x1.5342b569d4f82p0}, - {-0x1.ff86852a613ffp-111, 0x1.9bb2c011d93adp-54, 0x1.56f4736b527dap0}, - {-0x1.744ee506fdafep-109, 0x1.6324c054647adp-54, 0x1.5ab07dd485429p0}, - {-0x1.95f9ab75fa7d6p-108, 0x1.ba6f93080e65ep-54, 0x1.5e76f15ad2148p0}, - {0x1.5d8e757cfb991p-111, -0x1.383c17e40b497p-54, 0x1.6247eb03a5585p0}, - {0x1.4a337f4dc0a3bp-108, -0x1.bb60987591c34p-54, 0x1.6623882552225p0}, - {0x1.57d3e3adec175p-108, -0x1.bdd3413b26456p-54, 0x1.6a09e667f3bcdp0}, - {0x1.a59f88abbe778p-115, -0x1.bbe3a683c88abp-57, 0x1.6dfb23c651a2fp0}, - {-0x1.269796953a4c3p-109, -0x1.16e4786887a99p-55, 0x1.71f75e8ec5f74p0}, - {-0x1.8f8e7fa19e5e8p-108, -0x1.0245957316dd3p-54, 0x1.75feb564267c9p0}, - {-0x1.4217a932d10d4p-113, -0x1.41577ee04992fp-55, 0x1.7a11473eb0187p0}, - {0x1.70a1427f8fcdfp-112, 0x1.05d02ba15797ep-56, 0x1.7e2f336cf4e62p0}, - {0x1.0f6ad65cbbac1p-112, -0x1.d4c1dd41532d8p-54, 0x1.82589994cce13p0}, - {-0x1.f16f65181d921p-109, -0x1.fc6f89bd4f6bap-54, 0x1.868d99b4492edp0}, - {-0x1.30644a7836333p-110, 0x1.6e9f156864b27p-54, 0x1.8ace5422aa0dbp0}, - {0x1.3bf26d2b85163p-114, 0x1.5cc13a2e3976cp-55, 0x1.8f1ae99157736p0}, - {0x1.697e257ac0db2p-111, -0x1.75fc781b57ebcp-57, 0x1.93737b0cdc5e5p0}, - {0x1.7edb9d7144b6fp-108, -0x1.d185b7c1b85d1p-54, 0x1.97d829fde4e5p0}, - {0x1.6376b7943085cp-110, 0x1.c7c46b071f2bep-56, 0x1.9c49182a3f09p0}, - 
{0x1.354084551b4fbp-109, -0x1.359495d1cd533p-54, 0x1.a0c667b5de565p0}, - {-0x1.bfd7adfd63f48p-111, -0x1.d2f6edb8d41e1p-54, 0x1.a5503b23e255dp0}, - {0x1.8b16ae39e8cb9p-109, 0x1.0fac90ef7fd31p-54, 0x1.a9e6b5579fdbfp0}, - {0x1.a7fbc3ae675eap-108, 0x1.7a1cd345dcc81p-54, 0x1.ae89f995ad3adp0}, - {0x1.2babc0edda4d9p-111, -0x1.2805e3084d708p-57, 0x1.b33a2b84f15fbp0}, - {0x1.aa64481e1ab72p-111, -0x1.5584f7e54ac3bp-56, 0x1.b7f76f2fb5e47p0}, - {0x1.9a164050e1258p-109, 0x1.23dd07a2d9e84p-55, 0x1.bcc1e904bc1d2p0}, - {0x1.99e51125928dap-110, 0x1.11065895048ddp-55, 0x1.c199bdd85529cp0}, - {-0x1.fc44c329d5cb2p-109, 0x1.2884dff483cadp-54, 0x1.c67f12e57d14bp0}, - {0x1.d8765566b032ep-110, 0x1.503cbd1e949dbp-56, 0x1.cb720dcef9069p0}, - {-0x1.e7044039da0f6p-108, -0x1.cbc3743797a9cp-54, 0x1.d072d4a07897cp0}, - {-0x1.ab053b05531fcp-111, 0x1.2ed02d75b3707p-55, 0x1.d5818dcfba487p0}, - {0x1.7f6246f0ec615p-108, 0x1.c2300696db532p-54, 0x1.da9e603db3285p0}, - {0x1.b7225a944efd6p-108, -0x1.1a5cd4f184b5cp-54, 0x1.dfc97337b9b5fp0}, - {0x1.1e92cb3c2d278p-109, 0x1.39e8980a9cc8fp-55, 0x1.e502ee78b3ff6p0}, - {-0x1.fc0f242bbf3dep-109, -0x1.e9c23179c2893p-54, 0x1.ea4afa2a490dap0}, - {0x1.f6dd5d229ff69p-108, 0x1.dc7f486a4b6bp-54, 0x1.efa1bee615a27p0}, - {-0x1.4019bffc80ef3p-110, 0x1.9d3e12dd8a18bp-54, 0x1.f50765b6e454p0}, - {0x1.dc060c36f7651p-112, 0x1.74853f3a5931ep-55, 0x1.fa7c1819e90d8p0}, -}; - -// Lookup table for 2^(k * 2^-12) with k = 0..63. -// Generated by Sollya with: -// > display=hexadecimal; -// > prec = 500; -// > for i from 0 to 63 do { -// a = 2^(i * 2^-12); -// b = round(a, D, RN); -// c = round(a - b, D, RN); -// d = round(a - b - c, D, RN); -// print("{", d, ",", c, ",", b, "},"); -// }; -alignas(16) const fputil::TripleDouble EXP2_MID2[64] = { - {0, 0, 0x1p0}, - {0x1.39726694630e3p-108, 0x1.ae8e38c59c72ap-54, 0x1.000b175effdc7p0}, - {0x1.e5e06ddd31156p-112, -0x1.7b5d0d58ea8f4p-58, 0x1.00162f3904052p0}, - {0x1.5a0768b51f609p-111, 0x1.4115cb6b16a8ep-54, 0x1.0021478e11ce6p0}, - {0x1.d008403605217p-111, -0x1.d7c96f201bb2fp-55, 0x1.002c605e2e8cfp0}, - {0x1.89bc16f765708p-109, 0x1.84711d4c35e9fp-54, 0x1.003779a95f959p0}, - {-0x1.4535b7f8c1e2dp-109, -0x1.0484245243777p-55, 0x1.0042936faa3d8p0}, - {-0x1.8ba92f6b25456p-108, -0x1.4b237da2025f9p-54, 0x1.004dadb113dap0}, - {-0x1.30c72e81f4294p-113, -0x1.5e00e62d6b30dp-56, 0x1.0058c86da1c0ap0}, - {-0x1.34a5384e6f0b9p-110, 0x1.a1d6cedbb9481p-54, 0x1.0063e3a559473p0}, - {0x1.f8d0580865d2ep-108, -0x1.4acf197a00142p-54, 0x1.006eff583fc3dp0}, - {-0x1.002bcb3ae9a99p-111, -0x1.eaf2ea42391a5p-57, 0x1.007a1b865a8cap0}, - {0x1.c3c5aedee9851p-111, 0x1.da93f90835f75p-56, 0x1.0085382faef83p0}, - {0x1.7217851d1ec6ep-109, -0x1.6a79084ab093cp-55, 0x1.00905554425d4p0}, - {-0x1.80cbca335a7c3p-110, 0x1.86364f8fbe8f8p-54, 0x1.009b72f41a12bp0}, - {-0x1.706bd4eb22595p-110, -0x1.82e8e14e3110ep-55, 0x1.00a6910f3b6fdp0}, - {-0x1.b55dd523f3c08p-111, -0x1.4f6b2a7609f71p-55, 0x1.00b1afa5abcbfp0}, - {0x1.90a1e207cced1p-110, -0x1.e1a258ea8f71bp-56, 0x1.00bcceb7707ecp0}, - {0x1.78d0472db37c5p-110, 0x1.4362ca5bc26f1p-56, 0x1.00c7ee448ee02p0}, - {-0x1.bcd4db3cb52fep-109, 0x1.095a56c919d02p-54, 0x1.00d30e4d0c483p0}, - {-0x1.cf1b131575ec2p-112, -0x1.406ac4e81a645p-57, 0x1.00de2ed0ee0f5p0}, - {-0x1.6aaa1fa7ff913p-112, 0x1.b5a6902767e09p-54, 0x1.00e94fd0398ep0}, - {0x1.68f236dff3218p-110, -0x1.91b2060859321p-54, 0x1.00f4714af41d3p0}, - {-0x1.e8bb58067e60ap-109, 0x1.427068ab22306p-55, 0x1.00ff93412315cp0}, - {0x1.d4cd5e1d71fdfp-108, 0x1.c1d0660524e08p-54, 0x1.010ab5b2cbd11p0}, - {0x1.e4ecf350ebe88p-108, 
-0x1.e7bdfb3204be8p-54, 0x1.0115d89ff3a8bp0}, - {0x1.6a2aa2c89c4f8p-109, 0x1.843aa8b9cbbc6p-55, 0x1.0120fc089ff63p0}, - {0x1.1ca368a20ed05p-110, -0x1.34104ee7edae9p-56, 0x1.012c1fecd613bp0}, - {0x1.edb1095d925cfp-114, -0x1.2b6aeb6176892p-56, 0x1.0137444c9b5b5p0}, - {-0x1.488c78eded75fp-111, 0x1.a8cd33b8a1bb3p-56, 0x1.01426927f5278p0}, - {-0x1.7480f5ea1b3c9p-113, 0x1.2edc08e5da99ap-56, 0x1.014d8e7ee8d2fp0}, - {-0x1.ae45989a04dd5p-111, 0x1.57ba2dc7e0c73p-55, 0x1.0158b4517bb88p0}, - {0x1.bf48007d80987p-109, 0x1.b61299ab8cdb7p-54, 0x1.0163da9fb3335p0}, - {0x1.1aa91a059292cp-109, -0x1.90565902c5f44p-54, 0x1.016f0169949edp0}, - {0x1.b6663292855f5p-110, 0x1.70fc41c5c2d53p-55, 0x1.017a28af25567p0}, - {0x1.e7fbca6793d94p-108, 0x1.4b9a6e145d76cp-54, 0x1.018550706ab62p0}, - {-0x1.5b9f5c7de3b93p-110, -0x1.008eff5142bf9p-56, 0x1.019078ad6a19fp0}, - {0x1.4638bf2f6acabp-110, -0x1.77669f033c7dep-54, 0x1.019ba16628de2p0}, - {-0x1.ab237b9a069c5p-109, -0x1.09bb78eeead0ap-54, 0x1.01a6ca9aac5f3p0}, - {0x1.3ab358be97cefp-108, 0x1.371231477ece5p-54, 0x1.01b1f44af9f9ep0}, - {-0x1.4027b2294bb64p-110, 0x1.5e7626621eb5bp-56, 0x1.01bd1e77170b4p0}, - {0x1.656394426c99p-111, -0x1.bc72b100828a5p-54, 0x1.01c8491f08f08p0}, - {0x1.bf9785189bdd8p-111, -0x1.ce39cbbab8bbep-57, 0x1.01d37442d507p0}, - {0x1.7c12f86114fe3p-109, 0x1.16996709da2e2p-55, 0x1.01de9fe280ac8p0}, - {-0x1.653d5d24b5d28p-109, -0x1.c11f5239bf535p-55, 0x1.01e9cbfe113efp0}, - {0x1.04a0cdc1d86d7p-109, 0x1.e1d4eb5edc6b3p-55, 0x1.01f4f8958c1c6p0}, - {0x1.c678c46149782p-109, -0x1.afb99946ee3fp-54, 0x1.020025a8f6a35p0}, - {0x1.48524e1e9df7p-108, -0x1.8f06d8a148a32p-54, 0x1.020b533856324p0}, - {0x1.9953ea727ff0bp-109, -0x1.2bf310fc54eb6p-55, 0x1.02168143b0281p0}, - {-0x1.ccfbbec22d28ep-108, -0x1.c95a035eb4175p-54, 0x1.0221afcb09e3ep0}, - {0x1.9e2bb6e181de1p-108, -0x1.491793e46834dp-54, 0x1.022cdece68c4fp0}, - {0x1.f17609ae29308p-110, -0x1.3e8d0d9c49091p-56, 0x1.02380e4dd22adp0}, - {-0x1.c7dc2c476bfb8p-110, -0x1.314aa16278aa3p-54, 0x1.02433e494b755p0}, - {-0x1.fab994971d4a3p-109, 0x1.48daf888e9651p-55, 0x1.024e6ec0da046p0}, - {0x1.848b62cbdd0afp-109, 0x1.56dc8046821f4p-55, 0x1.02599fb483385p0}, - {-0x1.bf603ba715d0cp-109, 0x1.45b42356b9d47p-54, 0x1.0264d1244c719p0}, - {0x1.89434e751e1aap-110, -0x1.082ef51b61d7ep-56, 0x1.027003103b10ep0}, - {-0x1.03b54fd64e8acp-110, 0x1.2106ed0920a34p-56, 0x1.027b357854772p0}, - {0x1.7785ea0acc486p-109, -0x1.fd4cf26ea5d0fp-54, 0x1.0286685c9e059p0}, - {-0x1.ce447fdb35ff9p-109, -0x1.09f8775e78084p-54, 0x1.02919bbd1d1d8p0}, - {0x1.5b884aab5642ap-112, 0x1.64cbba902ca27p-58, 0x1.029ccf99d720ap0}, - {-0x1.cfb3e46d7c1cp-108, 0x1.4383ef231d207p-54, 0x1.02a803f2d170dp0}, - {-0x1.0d40cee4b81afp-112, 0x1.4a47a505b3a47p-54, 0x1.02b338c811703p0}, - {0x1.6ae7d36d7c1f7p-109, 0x1.e47120223467fp-54, 0x1.02be6e199c811p0}, -}; - } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h index e65f002845953..291816a7889ad 100644 --- a/libc/src/math/generic/common_constants.h +++ b/libc/src/math/generic/common_constants.h @@ -11,6 +11,7 @@ #include "src/__support/FPUtil/triple_double.h" #include "src/__support/macros/config.h" +#include "src/__support/math/exp_constants.h" #include "src/__support/number_pair.h" namespace LIBC_NAMESPACE_DECL { @@ -80,12 +81,6 @@ extern const double EXP_M1[195]; // > for i from 0 to 127 do { D(exp(i / 128)); }; extern const double EXP_M2[128]; -// Lookup table for 2^(k * 2^-6) with k = 0..63. 
-extern const fputil::TripleDouble EXP2_MID1[64];
-
-// Lookup table for 2^(k * 2^-12) with k = 0..63.
-extern const fputil::TripleDouble EXP2_MID2[64];
-
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H
diff --git a/libc/src/math/generic/coshf.cpp b/libc/src/math/generic/coshf.cpp
index c869f7d9dec5f..9f87564d524a6 100644
--- a/libc/src/math/generic/coshf.cpp
+++ b/libc/src/math/generic/coshf.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/coshf.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
diff --git a/libc/src/math/generic/exp.cpp b/libc/src/math/generic/exp.cpp
index 143800ca078a6..dc4d2ca480cb8 100644
--- a/libc/src/math/generic/exp.cpp
+++ b/libc/src/math/generic/exp.cpp
@@ -7,434 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/exp.h"
-#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
-#include "explogxf.h"         // ziv_test_denorm.
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/optional.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/double_double.h"
-#include "src/__support/FPUtil/dyadic_float.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/FPUtil/triple_double.h"
-#include "src/__support/common.h"
-#include "src/__support/integer_literals.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-
+#include "src/__support/math/exp.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using fputil::DoubleDouble;
-using fputil::TripleDouble;
-using Float128 = typename fputil::DyadicFloat<128>;
-
-using LIBC_NAMESPACE::operator""_u128;
-
-// log2(e)
-constexpr double LOG2_E = 0x1.71547652b82fep+0;
-
-// Error bounds:
-// Errors when using double precision.
-constexpr double ERR_D = 0x1.8p-63;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-// Errors when using double-double precision.
-constexpr double ERR_DD = 0x1.0p-99;
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-// -2^-12 * log(2)
-// > a = -2^-12 * log(2);
-// > b = round(a, 30, RN);
-// > c = round(a - b, 30, RN);
-// > d = round(a - b - c, D, RN);
-// Errors < 1.5 * 2^-133
-constexpr double MLOG_2_EXP2_M12_HI = -0x1.62e42ffp-13;
-constexpr double MLOG_2_EXP2_M12_MID = 0x1.718432a1b0e26p-47;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-constexpr double MLOG_2_EXP2_M12_MID_30 = 0x1.718432ap-47;
-constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79;
-#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-namespace {
-
-// Polynomial approximations with double precision:
-// Return expm1(dx) / dx ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24.
-// For |dx| < 2^-13 + 2^-30:
-//   | output - expm1(dx) / dx | < 2^-51.
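// Editorial sketch (illustration only, not part of this patch): spot-checking
// the quoted 2^-51 bound of the cubic approximation to expm1(dx)/dx defined in
// poly_approx_d below, with std::fma and std::expm1 from <cmath> standing in
// for libc's internal fputil::multiply_add helper.
#include <cmath>
#include <cstdio>

int main() {
  for (double dx : {0x1.0p-14, -0x1.0p-14, 0x1.8p-13}) {
    double dx2 = dx * dx;
    // c0 = 1 + dx / 2, c1 = 1/6 + dx / 24, p = c0 + dx^2 * c1.
    double c0 = std::fma(dx, 0.5, 1.0);
    double c1 = std::fma(dx, 0x1.5555555555555p-5, 0x1.5555555555555p-3);
    double p = std::fma(dx2, c1, c0);
    double err = p - std::expm1(dx) / dx; // expect |err| well below 2^-51
    std::printf("dx=%a err=%a\n", dx, err);
  }
  return 0;
}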
-LIBC_INLINE double poly_approx_d(double dx) { - // dx^2 - double dx2 = dx * dx; - // c0 = 1 + dx / 2 - double c0 = fputil::multiply_add(dx, 0.5, 1.0); - // c1 = 1/6 + dx / 24 - double c1 = - fputil::multiply_add(dx, 0x1.5555555555555p-5, 0x1.5555555555555p-3); - // p = dx^2 * c1 + c0 = 1 + dx / 2 + dx^2 / 6 + dx^3 / 24 - double p = fputil::multiply_add(dx2, c1, c0); - return p; -} - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Polynomial approximation with double-double precision: -// Return exp(dx) ~ 1 + dx + dx^2 / 2 + ... + dx^6 / 720 -// For |dx| < 2^-13 + 2^-30: -// | output - exp(dx) | < 2^-101 -DoubleDouble poly_approx_dd(const DoubleDouble &dx) { - // Taylor polynomial. - constexpr DoubleDouble COEFFS[] = { - {0, 0x1p0}, // 1 - {0, 0x1p0}, // 1 - {0, 0x1p-1}, // 1/2 - {0x1.5555555555555p-57, 0x1.5555555555555p-3}, // 1/6 - {0x1.5555555555555p-59, 0x1.5555555555555p-5}, // 1/24 - {0x1.1111111111111p-63, 0x1.1111111111111p-7}, // 1/120 - {-0x1.f49f49f49f49fp-65, 0x1.6c16c16c16c17p-10}, // 1/720 - }; - - DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], - COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); - return p; -} - -// Polynomial approximation with 128-bit precision: -// Return exp(dx) ~ 1 + dx + dx^2 / 2 + ... + dx^7 / 5040 -// For |dx| < 2^-13 + 2^-30: -// | output - exp(dx) | < 2^-126. -Float128 poly_approx_f128(const Float128 &dx) { - constexpr Float128 COEFFS_128[]{ - {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 - {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 - {Sign::POS, -128, 0x80000000'00000000'00000000'00000000_u128}, // 0.5 - {Sign::POS, -130, 0xaaaaaaaa'aaaaaaaa'aaaaaaaa'aaaaaaab_u128}, // 1/6 - {Sign::POS, -132, 0xaaaaaaaa'aaaaaaaa'aaaaaaaa'aaaaaaab_u128}, // 1/24 - {Sign::POS, -134, 0x88888888'88888888'88888888'88888889_u128}, // 1/120 - {Sign::POS, -137, 0xb60b60b6'0b60b60b'60b60b60'b60b60b6_u128}, // 1/720 - {Sign::POS, -140, 0xd00d00d0'0d00d00d'00d00d00'd00d00d0_u128}, // 1/5040 - }; - - Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], - COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], - COEFFS_128[6], COEFFS_128[7]); - return p; -} - -// Compute exp(x) using 128-bit precision. -// TODO(lntue): investigate triple-double precision implementation for this -// step. -Float128 exp_f128(double x, double kd, int idx1, int idx2) { - // Recalculate dx: - - double t1 = fputil::multiply_add(kd, MLOG_2_EXP2_M12_HI, x); // exact - double t2 = kd * MLOG_2_EXP2_M12_MID_30; // exact - double t3 = kd * MLOG_2_EXP2_M12_LO; // Error < 2^-133 - - Float128 dx = fputil::quick_add( - Float128(t1), fputil::quick_add(Float128(t2), Float128(t3))); - - // TODO: Skip recalculating exp_mid1 and exp_mid2. - Float128 exp_mid1 = - fputil::quick_add(Float128(EXP2_MID1[idx1].hi), - fputil::quick_add(Float128(EXP2_MID1[idx1].mid), - Float128(EXP2_MID1[idx1].lo))); - - Float128 exp_mid2 = - fputil::quick_add(Float128(EXP2_MID2[idx2].hi), - fputil::quick_add(Float128(EXP2_MID2[idx2].mid), - Float128(EXP2_MID2[idx2].lo))); - - Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); - - Float128 p = poly_approx_f128(dx); - - Float128 r = fputil::quick_mul(exp_mid, p); - - r.exponent += static_cast(kd) >> 12; - - return r; -} - -// Compute exp(x) with double-double precision. 
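// Editorial sketch: the error-free "two-sum" transform (Knuth's 2Sum) that a
// double-double exact_add, as used by exp_double_double below, is typically
// built on; fputil's actual implementation may differ in details. The names
// DD and two_sum are illustrative, not libc APIs.
#include <cstdio>

struct DD {
  double hi, lo;
};

DD two_sum(double a, double b) {
  double s = a + b;                     // rounded sum
  double bp = s - a;                    // portion of b absorbed into s
  double e = (a - (s - bp)) + (b - bp); // exact rounding error: s + e == a + b
  return {s, e};
}

int main() {
  DD r = two_sum(1.0, 0x1.0p-60); // lo recovers the bit that hi loses
  std::printf("hi=%a lo=%a\n", r.hi, r.lo);
  return 0;
}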
-DoubleDouble exp_double_double(double x, double kd, - const DoubleDouble &exp_mid) { - // Recalculate dx: - // dx = x - k * 2^-12 * log(2) - double t1 = fputil::multiply_add(kd, MLOG_2_EXP2_M12_HI, x); // exact - double t2 = kd * MLOG_2_EXP2_M12_MID_30; // exact - double t3 = kd * MLOG_2_EXP2_M12_LO; // Error < 2^-130 - - DoubleDouble dx = fputil::exact_add(t1, t2); - dx.lo += t3; - - // Degree-6 Taylor polynomial approximation in double-double precision. - // | p - exp(x) | < 2^-100. - DoubleDouble p = poly_approx_dd(dx); - - // Error bounds: 2^-99. - DoubleDouble r = fputil::quick_mult(exp_mid, p); - - return r; -} -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// Check for exceptional cases when -// |x| <= 2^-53 or x < log(2^-1075) or x >= 0x1.6232bdd7abcd3p+9 -double set_exceptional(double x) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - uint64_t x_abs = xbits.abs().uintval(); - - // |x| <= 2^-53 - if (x_abs <= 0x3ca0'0000'0000'0000ULL) { - // exp(x) ~ 1 + x - return 1 + x; - } - - // x <= log(2^-1075) || x >= 0x1.6232bdd7abcd3p+9 or inf/nan. - - // x <= log(2^-1075) or -inf/nan - if (x_u >= 0xc087'4910'd52d'3052ULL) { - // exp(-Inf) = 0 - if (xbits.is_inf()) - return 0.0; - - // exp(nan) = nan - if (xbits.is_nan()) - return x; - - if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_subnormal().get_val(); - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW); - return 0.0; - } - - // x >= round(log(MAX_NORMAL), D, RU) = 0x1.62e42fefa39fp+9 or +inf/nan - // x is finite - if (x_u < 0x7ff0'0000'0000'0000ULL) { - int rounding = fputil::quick_get_round(); - if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - } - // x is +inf or nan - return x + FPBits::inf().get_val(); -} - -} // namespace - -LLVM_LIBC_FUNCTION(double, exp, (double x)) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - - // Upper bound: max normal number = 2^1023 * (2 - 2^-52) - // > round(log (2^1023 ( 2 - 2^-52 )), D, RU) = 0x1.62e42fefa39fp+9 - // > round(log (2^1023 ( 2 - 2^-52 )), D, RD) = 0x1.62e42fefa39efp+9 - // > round(log (2^1023 ( 2 - 2^-52 )), D, RN) = 0x1.62e42fefa39efp+9 - // > round(exp(0x1.62e42fefa39fp+9), D, RN) = infty - - // Lower bound: min denormal number / 2 = 2^-1075 - // > round(log(2^-1075), D, RN) = -0x1.74910d52d3052p9 - - // Another lower bound: min normal number = 2^-1022 - // > round(log(2^-1022), D, RN) = -0x1.6232bdd7abcd2p9 - - // x < log(2^-1075) or x >= 0x1.6232bdd7abcd3p+9 or |x| < 2^-53. - if (LIBC_UNLIKELY(x_u >= 0xc0874910d52d3052 || - (x_u < 0xbca0000000000000 && x_u >= 0x40862e42fefa39f0) || - x_u < 0x3ca0000000000000)) { - return set_exceptional(x); - } - - // Now log(2^-1075) <= x <= -2^-53 or 2^-53 <= x < log(2^1023 * (2 - 2^-52)) - - // Range reduction: - // Let x = log(2) * (hi + mid1 + mid2) + lo - // in which: - // hi is an integer - // mid1 * 2^6 is an integer - // mid2 * 2^12 is an integer - // then: - // exp(x) = 2^hi * 2^(mid1) * 2^(mid2) * exp(lo). - // With this formula: - // - multiplying by 2^hi is exact and cheap, simply by adding the exponent - // field. - // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. - // - exp(lo) ~ 1 + lo + a0 * lo^2 + ... 
-  //
-  // They can be defined by:
-  //   hi + mid1 + mid2 = 2^(-12) * round(2^12 * log_2(e) * x)
-  // If we store L2E = round(log2(e), D, RN), then:
-  //   log2(e) - L2E ~ 1.5 * 2^(-56)
-  // So the errors when computing in double precision are:
-  //   | x * 2^12 * log_2(e) - D(x * 2^12 * L2E) | <=
-  //     <= | x * 2^12 * log_2(e) - x * 2^12 * L2E | +
-  //        + | x * 2^12 * L2E - D(x * 2^12 * L2E) |
-  //     <= 2^12 * ( |x| * 1.5 * 2^-56 + eps(x))   for RN
-  //        2^12 * ( |x| * 1.5 * 2^-56 + 2*eps(x)) for other rounding modes.
-  // So if:
-  //   hi + mid1 + mid2 = 2^(-12) * round(x * 2^12 * L2E) is computed entirely
-  // in double precision, the reduced argument:
-  //   lo = x - log(2) * (hi + mid1 + mid2) is bounded by:
-  //   |lo| <= 2^-13 + (|x| * 1.5 * 2^-56 + 2*eps(x))
-  //        < 2^-13 + (1.5 * 2^9 * 1.5 * 2^-56 + 2*2^(9 - 52))
-  //        < 2^-13 + 2^-41
-  //
-
-  // The following trick computes round(x * 2^12 * L2E) more efficiently
-  // than using the rounding instructions, at the cost of slightly less
-  // accuracy, and hence a slightly larger range for the reduced argument `lo`.
-  //
-  // To be precise, since |x| < |log(2^-1075)| < 1.5 * 2^9,
-  //   |x * 2^12 * L2E| < 1.5 * 2^9 * 1.5 < 2^23,
-  // So we can fit the rounded result round(x * 2^12 * L2E) in int32_t.
-  // Thus, the goal is to be able to use an additional addition and fixed width
-  // shift to get an int32_t representing round(x * 2^12 * L2E).
-  //
-  // Assuming int32_t uses the two's complement representation, since the
-  // mantissa part of a double precision is unsigned with the leading bit
-  // hidden, if we add an extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 25 to
-  // the product, the bits below 2^e2 in the resulting mantissa of
-  // (x*2^12*L2E + C) can be treated as a proper two's complement
-  // representation of x*2^12*L2E.
-  //
-  // One small problem with this approach is that the sum (x*2^12*L2E + C) in
-  // double precision is rounded to the least significant bit of the dominant
-  // factor C. In order to minimize the rounding errors from this addition, we
-  // want to minimize e1. Another constraint is that after shifting the
-  // mantissa so that the least significant bit of int32_t corresponds to the
-  // unit bit of (x*2^12*L2E), the sign is correct without any adjustment. So
-  // combining these two requirements, we can choose C = 2^33 + 2^32, so that
-  // the sign bit corresponds to the 2^31 bit, and hence after right shifting
-  // the mantissa, the resulting int32_t has the correct sign.
-  // With this choice of C, the number of mantissa bits we need to shift to the
-  // right is: 52 - 33 = 19.
-  //
-  // Moreover, since the integer right shifts are equivalent to rounding down,
-  // we can add an extra 0.5 so that it will become round-to-nearest, tie-to-
-  // +infinity. So in particular, we can compute:
-  //   tmp = x * 2^12 * L2E + C,
-  // where C = 2^33 + 2^32 + 2^-1, then if
-  //   k = int32_t(lower 51 bits of double(x * 2^12 * L2E + C) >> 19),
-  // the reduced argument:
-  //   lo = x - log(2) * 2^-12 * k is bounded by:
-  //   |lo| <= 2^-13 + 2^-41 + 2^-12*2^-19
-  //         = 2^-13 + 2^-31 + 2^-41.
-  //
-  // Finally, notice that k only uses the mantissa of x * 2^12 * L2E, so the
-  // exponent 2^12 is not needed. So we can simply define
-  //   C = 2^(33 - 12) + 2^(32 - 12) + 2^(-1 - 12), and
-  //   k = int32_t(lower 51 bits of double(x * L2E + C) >> 19).
 
-  // Rounding errors <= 2^-31 + 2^-41.
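// Editorial sketch: a standalone check of the add-big-constant rounding trick
// explained above, with std::fma/std::bit_cast standing in for
// fputil::multiply_add/cpp::bit_cast. Assumes round-to-nearest mode; the trick
// and std::lround may disagree on exact ties (the trick rounds ties upward).
#include <bit>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr double LOG2_E = 0x1.71547652b82fep+0;
  constexpr double C = 0x1.8000'0000'4p21; // 2^21 + 2^20 + 2^-13
  for (double x : {0.5, -1.25, 3.75, -100.0}) {
    double tmp = std::fma(x, LOG2_E, C);
    // The low 32 bits of (bits >> 19) read back as two's complement.
    int k = static_cast<int>(std::bit_cast<uint64_t>(tmp) >> 19);
    long k_ref = std::lround(x * 0x1.0p12 * LOG2_E); // reference rounding
    std::printf("x=%g k=%d ref=%ld\n", x, k, k_ref);
  }
  return 0;
}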
- double tmp = fputil::multiply_add(x, LOG2_E, 0x1.8000'0000'4p21); - int k = static_cast(cpp::bit_cast(tmp) >> 19); - double kd = static_cast(k); - - uint32_t idx1 = (k >> 6) & 0x3f; - uint32_t idx2 = k & 0x3f; - int hi = k >> 12; - - bool denorm = (hi <= -1022); - - DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; - DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; - - DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); - - // |x - (hi + mid1 + mid2) * log(2) - dx| < 2^11 * eps(M_LOG_2_EXP2_M12.lo) - // = 2^11 * 2^-13 * 2^-52 - // = 2^-54. - // |dx| < 2^-13 + 2^-30. - double lo_h = fputil::multiply_add(kd, MLOG_2_EXP2_M12_HI, x); // exact - double dx = fputil::multiply_add(kd, MLOG_2_EXP2_M12_MID, lo_h); - - // We use the degree-4 Taylor polynomial to approximate exp(lo): - // exp(lo) ~ 1 + lo + lo^2 / 2 + lo^3 / 6 + lo^4 / 24 = 1 + lo * P(lo) - // So that the errors are bounded by: - // |P(lo) - expm1(lo)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58 - // Let P_ be an evaluation of P where all intermediate computations are in - // double precision. Using either Horner's or Estrin's schemes, the evaluated - // errors can be bounded by: - // |P_(dx) - P(dx)| < 2^-51 - // => |dx * P_(dx) - expm1(lo) | < 1.5 * 2^-64 - // => 2^(mid1 + mid2) * |dx * P_(dx) - expm1(lo)| < 1.5 * 2^-63. - // Since we approximate - // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo, - // We use the expression: - // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~ - // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo) - // with errors bounded by 1.5 * 2^-63. - - double mid_lo = dx * exp_mid.hi; - - // Approximate expm1(dx)/dx ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24. - double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (LIBC_UNLIKELY(denorm)) { - return ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D) - .value(); - } else { - // to multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = - cpp::bit_cast(exp_hi + cpp::bit_cast(exp_mid.hi + lo)); - return r; - } -#else - if (LIBC_UNLIKELY(denorm)) { - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); - LIBC_LIKELY(r.has_value())) - return r.value(); - } else { - double upper = exp_mid.hi + (lo + ERR_D); - double lower = exp_mid.hi + (lo - ERR_D); - - if (LIBC_LIKELY(upper == lower)) { - // to multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. 
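// Editorial sketch: the "add hi to the exponent field" scaling mentioned in
// the comment above, checked against std::ldexp; only valid while the input
// and result stay in the normal range (no exponent overflow or underflow).
// mul_pow2 is an illustrative name, not a libc function.
#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

double mul_pow2(double v, int hi) {
  int64_t exp_hi = static_cast<int64_t>(hi) << 52; // 52 = FRACTION_LEN
  return std::bit_cast<double>(exp_hi + std::bit_cast<int64_t>(v));
}

int main() {
  assert(mul_pow2(1.5, 10) == std::ldexp(1.5, 10));
  assert(mul_pow2(0x1.23456789abcdfp0, -7) == std::ldexp(0x1.23456789abcdfp0, -7));
  return 0;
}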
- int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); - return r; - } - } - - // Use double-double - DoubleDouble r_dd = exp_double_double(x, kd, exp_mid); - - if (LIBC_UNLIKELY(denorm)) { - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); - LIBC_LIKELY(r.has_value())) - return r.value(); - } else { - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); - - if (LIBC_LIKELY(upper_dd == lower_dd)) { - int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = - cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); - return r; - } - } - - // Use 128-bit precision - Float128 r_f128 = exp_f128(x, kd, idx1, idx2); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} +LLVM_LIBC_FUNCTION(double, exp, (double x)) { return math::exp(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index c464979b092c3..5c36d28c166ae 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -7,491 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10.h" -#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. -#include "explogxf.h" // ziv_test_denorm. -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/dyadic_float.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/FPUtil/triple_double.h" -#include "src/__support/common.h" -#include "src/__support/integer_literals.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/exp10.h" namespace LIBC_NAMESPACE_DECL { -using fputil::DoubleDouble; -using fputil::TripleDouble; -using Float128 = typename fputil::DyadicFloat<128>; - -using LIBC_NAMESPACE::operator""_u128; - -// log2(10) -constexpr double LOG2_10 = 0x1.a934f0979a371p+1; - -// -2^-12 * log10(2) -// > a = -2^-12 * log10(2); -// > b = round(a, 32, RN); -// > c = round(a - b, 32, RN); -// > d = round(a - b - c, D, RN); -// Errors < 1.5 * 2^-144 -constexpr double MLOG10_2_EXP2_M12_HI = -0x1.3441350ap-14; -constexpr double MLOG10_2_EXP2_M12_MID = 0x1.0c0219dc1da99p-51; - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -constexpr double MLOG10_2_EXP2_M12_MID_32 = 0x1.0c0219dcp-51; -constexpr double MLOG10_2_EXP2_M12_LO = 0x1.da994fd20dba2p-87; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// Error bounds: -// Errors when using double precision. -constexpr double ERR_D = 0x1.8p-63; - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Errors when using double-double precision. -constexpr double ERR_DD = 0x1.8p-99; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -namespace { - -// Polynomial approximations with double precision. Generated by Sollya with: -// > P = fpminimax((10^x - 1)/x, 3, [|D...|], [-2^-14, 2^-14]); -// > P; -// Error bounds: -// | output - (10^dx - 1) / dx | < 2^-52. 
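// Editorial sketch: spot-checking the quoted 2^-52 bound of the minimax
// polynomial defined right below; (10^dx - 1)/dx is computed here as
// std::expm1(dx * ln(10))/dx, whose own rounding error also enters the
// printed difference. Coefficients are copied from poly_approx_d below.
#include <cmath>
#include <cstdio>

int main() {
  const double LN10 = std::log(10.0);
  for (double dx : {0x1.0p-15, -0x1.0p-15, 0x1.ffp-15}) {
    double dx2 = dx * dx;
    double c0 = std::fma(dx, 0x1.53524c73cea6ap+1, 0x1.26bb1bbb55516p+1);
    double c1 = std::fma(dx, 0x1.2bd75cc6afc65p+0, 0x1.0470587aa264cp+1);
    double p = std::fma(dx2, c1, c0);        // ~ (10^dx - 1) / dx
    double ref = std::expm1(dx * LN10) / dx; // reference value
    std::printf("dx=%a p-ref=%a\n", dx, p - ref);
  }
  return 0;
}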
-LIBC_INLINE double poly_approx_d(double dx) {
-  // dx^2
-  double dx2 = dx * dx;
-  double c0 =
-      fputil::multiply_add(dx, 0x1.53524c73cea6ap+1, 0x1.26bb1bbb55516p+1);
-  double c1 =
-      fputil::multiply_add(dx, 0x1.2bd75cc6afc65p+0, 0x1.0470587aa264cp+1);
-  double p = fputil::multiply_add(dx2, c1, c0);
-  return p;
-}
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-// Polynomial approximation with double-double precision. Generated by Sollya
-// with:
-// > P = fpminimax((10^x - 1)/x, 5, [|DD...|], [-2^-14, 2^-14]);
-// Error bounds:
-//   | output - 10^(dx) | < 2^-101
-DoubleDouble poly_approx_dd(const DoubleDouble &dx) {
-  // Minimax polynomial coefficients.
-  constexpr DoubleDouble COEFFS[] = {
-      {0, 0x1p0},
-      {-0x1.f48ad494e927bp-53, 0x1.26bb1bbb55516p1},
-      {-0x1.e2bfab3191cd2p-53, 0x1.53524c73cea69p1},
-      {0x1.80fb65ec3b503p-53, 0x1.0470591de2ca4p1},
-      {0x1.338fc05e21e55p-54, 0x1.2bd7609fd98c4p0},
-      {0x1.d4ea116818fbp-56, 0x1.1429ffd519865p-1},
-      {-0x1.872a8ff352077p-57, 0x1.a7ed70847c8b3p-3},
-
-  };
-
-  DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2],
-                                    COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]);
-  return p;
-}
-
-// Polynomial approximation with 128-bit precision:
-// Return 10^dx ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7
-// For |dx| < 2^-14:
-//   | output - 10^dx | < 1.5 * 2^-124.
-Float128 poly_approx_f128(const Float128 &dx) {
-  constexpr Float128 COEFFS_128[]{
-      {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0
-      {Sign::POS, -126, 0x935d8ddd'aaa8ac16'ea56d62b'82d30a2d_u128},
-      {Sign::POS, -126, 0xa9a92639'e753443a'80a99ce7'5f4d5bdb_u128},
-      {Sign::POS, -126, 0x82382c8e'f1652304'6a4f9d7d'bf6c9635_u128},
-      {Sign::POS, -124, 0x12bd7609'fd98c44c'34578701'9216c7af_u128},
-      {Sign::POS, -127, 0x450a7ff4'7535d889'cc41ed7e'0d27aee5_u128},
-      {Sign::POS, -130, 0xd3f6b844'702d636b'8326bb91'a6e7601d_u128},
-      {Sign::POS, -130, 0x45b937f0'd05bb1cd'fa7b46df'314112a9_u128},
-  };
-
-  Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2],
-                                COEFFS_128[3], COEFFS_128[4], COEFFS_128[5],
-                                COEFFS_128[6], COEFFS_128[7]);
-  return p;
-}
-
-// Compute 10^(x) using 128-bit precision.
-// TODO(lntue): investigate triple-double precision implementation for this
-// step.
-Float128 exp10_f128(double x, double kd, int idx1, int idx2) {
-  double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact
-  double t2 = kd * MLOG10_2_EXP2_M12_MID_32;                     // exact
-  double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-144
-
-  Float128 dx = fputil::quick_add(
-      Float128(t1), fputil::quick_add(Float128(t2), Float128(t3)));
-
-  // TODO: Skip recalculating exp_mid1 and exp_mid2.
-  Float128 exp_mid1 =
-      fputil::quick_add(Float128(EXP2_MID1[idx1].hi),
-                        fputil::quick_add(Float128(EXP2_MID1[idx1].mid),
-                                          Float128(EXP2_MID1[idx1].lo)));
-
-  Float128 exp_mid2 =
-      fputil::quick_add(Float128(EXP2_MID2[idx2].hi),
-                        fputil::quick_add(Float128(EXP2_MID2[idx2].mid),
-                                          Float128(EXP2_MID2[idx2].lo)));
-
-  Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2);
-
-  Float128 p = poly_approx_f128(dx);
-
-  Float128 r = fputil::quick_mul(exp_mid, p);
-
-  r.exponent += static_cast<int>(kd) >> 12;
-
-  return r;
-}
-
-// Compute 10^x with double-double precision.
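// Editorial sketch: the fma-based "two-prod" transform that double-double
// multiplication (fputil::quick_mult in the function below) is typically
// built on; p + e equals a * b exactly. DD and two_prod are illustrative
// names, not libc APIs.
#include <cmath>
#include <cstdio>

struct DD {
  double hi, lo;
};

DD two_prod(double a, double b) {
  double p = a * b;
  double e = std::fma(a, b, -p); // exact rounding error of the product
  return {p, e};
}

int main() {
  DD r = two_prod(0x1.0000001p0, 0x1.0000001p0);
  std::printf("hi=%a lo=%a\n", r.hi, r.lo);
  return 0;
}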
-DoubleDouble exp10_double_double(double x, double kd, - const DoubleDouble &exp_mid) { - // Recalculate dx: - // dx = x - k * 2^-12 * log10(2) - double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact - double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-140 - - DoubleDouble dx = fputil::exact_add(t1, t2); - dx.lo += t3; - - // Degree-6 polynomial approximation in double-double precision. - // | p - 10^x | < 2^-103. - DoubleDouble p = poly_approx_dd(dx); - - // Error bounds: 2^-102. - DoubleDouble r = fputil::quick_mult(exp_mid, p); - - return r; -} -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// When output is denormal. -double exp10_denorm(double x) { - // Range reduction. - double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); - int k = static_cast(cpp::bit_cast(tmp) >> 19); - double kd = static_cast(k); - - uint32_t idx1 = (k >> 6) & 0x3f; - uint32_t idx2 = k & 0x3f; - - int hi = k >> 12; - - DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; - DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; - DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); - - // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 - double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); - - double mid_lo = dx * exp_mid.hi; - - // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. - double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - return ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D) - .value(); -#else - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use double-double - DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); - - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use 128-bit precision - Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} - -// Check for exceptional cases when: -// * log10(1 - 2^-54) < x < log10(1 + 2^-53) -// * x >= log10(2^1024) -// * x <= log10(2^-1022) -// * x is inf or nan -double set_exceptional(double x) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - uint64_t x_abs = xbits.abs().uintval(); - - // |x| < log10(1 + 2^-53) - if (x_abs <= 0x3c8bcb7b1526e50e) { - // 10^(x) ~ 1 + x/2 - return fputil::multiply_add(x, 0.5, 1.0); - } - - // x <= log10(2^-1022) || x >= log10(2^1024) or inf/nan. 
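// Editorial sketch: where the magic bit patterns tested in set_exceptional
// come from; each is, up to the rounding of this naive computation (so
// possibly off in the last bit), the encoding of a log10 threshold.
#include <bit>
#include <cinttypes>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const double log10_2 = std::log10(2.0);
  // Compare against 0xc0733a7146f72a42 (log10(2^-1022)) and
  // 0x40734413509f79ff (log10(2^1024)) used below.
  std::printf("%" PRIx64 "\n", std::bit_cast<uint64_t>(-1022 * log10_2));
  std::printf("%" PRIx64 "\n", std::bit_cast<uint64_t>(1024 * log10_2));
  return 0;
}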
-  if (x_u >= 0xc0733a7146f72a42) {
-    // x <= log10(2^-1075) or -inf/nan
-    if (x_u > 0xc07439b746e36b52) {
-      // 10^(-inf) = 0
-      if (xbits.is_inf())
-        return 0.0;
-
-      // 10^(nan) = nan
-      if (xbits.is_nan())
-        return x;
-
-      if (fputil::quick_get_round() == FE_UPWARD)
-        return FPBits::min_subnormal().get_val();
-      fputil::set_errno_if_required(ERANGE);
-      fputil::raise_except_if_required(FE_UNDERFLOW);
-      return 0.0;
-    }
-
-    return exp10_denorm(x);
-  }
-
-  // x >= log10(2^1024) or +inf/nan
-  // x is finite
-  if (x_u < 0x7ff0'0000'0000'0000ULL) {
-    int rounding = fputil::quick_get_round();
-    if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-      return FPBits::max_normal().get_val();
-
-    fputil::set_errno_if_required(ERANGE);
-    fputil::raise_except_if_required(FE_OVERFLOW);
-  }
-  // x is +inf or nan
-  return x + FPBits::inf().get_val();
-}
-
-} // namespace
-
-LLVM_LIBC_FUNCTION(double, exp10, (double x)) {
-  using FPBits = typename fputil::FPBits<double>;
-  FPBits xbits(x);
-
-  uint64_t x_u = xbits.uintval();
-
-  // x <= log10(2^-1022) or x >= log10(2^1024) or
-  // log10(1 - 2^-54) < x < log10(1 + 2^-53).
-  if (LIBC_UNLIKELY(x_u >= 0xc0733a7146f72a42 ||
-                    (x_u <= 0xbc7bcb7b1526e50e && x_u >= 0x40734413509f79ff) ||
-                    x_u < 0x3c8bcb7b1526e50e)) {
-    return set_exceptional(x);
-  }
-
-  // Now log10(2^-1075) < x <= log10(1 - 2^-54) or
-  //     log10(1 + 2^-53) < x < log10(2^1024)
-
-  // Range reduction:
-  // Let x = log10(2) * (hi + mid1 + mid2) + lo
-  // in which:
-  //   hi is an integer
-  //   mid1 * 2^6 is an integer
-  //   mid2 * 2^12 is an integer
-  // then:
-  //   10^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 10^(lo).
-  // With this formula:
-  //   - multiplying by 2^hi is exact and cheap, simply by adding hi to the
-  //     exponent field.
-  //   - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables.
-  //   - 10^(lo) ~ 1 + a0*lo + a1 * lo^2 + ...
-  //
-  // We compute (hi + mid1 + mid2) together by performing the rounding on
-  //   x * log2(10) * 2^12.
-  // Since |x| < |log10(2^-1075)| and hence |x * log2(10)| < 1075 < 2^11,
-  //   |x * log2(10) * 2^12| < 2^11 * 2^12 = 2^23,
-  // So we can fit the rounded result round(x * log2(10) * 2^12) in int32_t.
-  // Thus, the goal is to be able to use an additional addition and fixed width
-  // shift to get an int32_t representing round(x * log2(10) * 2^12).
-  //
-  // Assuming int32_t uses the two's complement representation, since the
-  // mantissa part of a double precision is unsigned with the leading bit
-  // hidden, if we add an extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 25 to
-  // the product, the bits below 2^e2 in the resulting mantissa of
-  // (x*2^12*log2(10) + C) can be treated as a proper two's complement
-  // representation of x*2^12*log2(10).
-  //
-  // One small problem with this approach is that the sum (x*2^12*log2(10) + C)
-  // in double precision is rounded to the least significant bit of the
-  // dominant factor C. In order to minimize the rounding errors from this
-  // addition, we want to minimize e1. Another constraint is that after
-  // shifting the mantissa so that the least significant bit of int32_t
-  // corresponds to the unit bit of (x*2^12*log2(10)), the sign is correct
-  // without any adjustment. So combining these two requirements, we can choose
-  // C = 2^33 + 2^32, so that the sign bit corresponds to the 2^31 bit, and
-  // hence after right shifting the mantissa, the resulting int32_t has the
-  // correct sign.
-  // With this choice of C, the number of mantissa bits we need to shift to the
-  // right is: 52 - 33 = 19.
-  //
-  // Moreover, since the integer right shifts are equivalent to rounding down,
-  // we can add an extra 0.5 so that it will become round-to-nearest, tie-to-
-  // +infinity. So in particular, we can compute:
-  //   tmp = x * log2(10) * 2^12 + C,
-  // where C = 2^33 + 2^32 + 2^-1, then if
-  //   k = int32_t(lower 51 bits of double(x * log2(10) * 2^12 + C) >> 19),
-  // the reduced argument:
-  //   lo = x - log10(2) * 2^-12 * k is bounded by:
-  //   |lo| = |x - log10(2) * 2^-12 * k|
-  //        = log10(2) * 2^-12 * | x * log2(10) * 2^12 - k |
-  //       <= log10(2) * 2^-12 * (2^-1 + 2^-19)
-  //        < 1.5 * 2^-2 * (2^-13 + 2^-31)
-  //        = 1.5 * (2^-15 + 2^-33)
-  //
-  // Finally, notice that k only uses the mantissa of x * log2(10) * 2^12, so
-  // the exponent 2^12 is not needed. So we can simply define
-  //   C = 2^(33 - 12) + 2^(32 - 12) + 2^(-1 - 12), and
-  //   k = int32_t(lower 51 bits of double(x * log2(10) + C) >> 19).
 
-  // Rounding errors <= 2^-31.
-  double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21);
-  int k = static_cast<int>(cpp::bit_cast<uint64_t>(tmp) >> 19);
-  double kd = static_cast<double>(k);
-
-  uint32_t idx1 = (k >> 6) & 0x3f;
-  uint32_t idx2 = k & 0x3f;
-
-  int hi = k >> 12;
-
-  DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi};
-  DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi};
-  DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2);
-
-  // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14
-  double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact
-  double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h);
-
-  // We use the degree-4 polynomial to approximate 10^(lo):
-  //   10^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4
-  //           = 1 + lo * P(lo)
-  // So that the errors are bounded by:
-  //   |P(lo) - (10^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58
-  // Let P_ be an evaluation of P where all intermediate computations are in
-  // double precision. Using either Horner's or Estrin's schemes, the evaluated
-  // errors can be bounded by:
-  //   |P_(lo) - P(lo)| < 2^-51
-  //   => |lo * P_(lo) - (10^lo - 1) | < 2^-65
-  //   => 2^(mid1 + mid2) * |lo * P_(lo) - (10^lo - 1)| < 2^-64.
-  // Since we approximate
-  //   2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo,
-  // we use the expression:
-  //   (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~
-  //   ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo)
-  // with errors bounded by 2^-64.
-
-  double mid_lo = dx * exp_mid.hi;
-
-  // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4.
-  double p = poly_approx_d(dx);
-
-  double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo);
-
-#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
-  double r =
-      cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(exp_mid.hi + lo));
-  return r;
-#else
-  double upper = exp_mid.hi + (lo + ERR_D);
-  double lower = exp_mid.hi + (lo - ERR_D);
-
-  if (LIBC_LIKELY(upper == lower)) {
-    // To multiply by 2^hi, a fast way is to simply add hi to the exponent
-    // field.
-    int64_t exp_hi = static_cast<int64_t>(hi) << FPBits::FRACTION_LEN;
-    double r = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(upper));
-    return r;
-  }
-
-  // Exact outputs when x = 1, 2, ..., 22 + hard to round with x = 23.
-  // Quick check mask: 0x8000'ffff'ffff'ffffULL = ~(bits of 1.0 | ...
| bits of 23.0) - if (LIBC_UNLIKELY((x_u & 0x8000'ffff'ffff'ffffULL) == 0ULL)) { - switch (x_u) { - case 0x3ff0000000000000: // x = 1.0 - return 10.0; - case 0x4000000000000000: // x = 2.0 - return 100.0; - case 0x4008000000000000: // x = 3.0 - return 1'000.0; - case 0x4010000000000000: // x = 4.0 - return 10'000.0; - case 0x4014000000000000: // x = 5.0 - return 100'000.0; - case 0x4018000000000000: // x = 6.0 - return 1'000'000.0; - case 0x401c000000000000: // x = 7.0 - return 10'000'000.0; - case 0x4020000000000000: // x = 8.0 - return 100'000'000.0; - case 0x4022000000000000: // x = 9.0 - return 1'000'000'000.0; - case 0x4024000000000000: // x = 10.0 - return 10'000'000'000.0; - case 0x4026000000000000: // x = 11.0 - return 100'000'000'000.0; - case 0x4028000000000000: // x = 12.0 - return 1'000'000'000'000.0; - case 0x402a000000000000: // x = 13.0 - return 10'000'000'000'000.0; - case 0x402c000000000000: // x = 14.0 - return 100'000'000'000'000.0; - case 0x402e000000000000: // x = 15.0 - return 1'000'000'000'000'000.0; - case 0x4030000000000000: // x = 16.0 - return 10'000'000'000'000'000.0; - case 0x4031000000000000: // x = 17.0 - return 100'000'000'000'000'000.0; - case 0x4032000000000000: // x = 18.0 - return 1'000'000'000'000'000'000.0; - case 0x4033000000000000: // x = 19.0 - return 10'000'000'000'000'000'000.0; - case 0x4034000000000000: // x = 20.0 - return 100'000'000'000'000'000'000.0; - case 0x4035000000000000: // x = 21.0 - return 1'000'000'000'000'000'000'000.0; - case 0x4036000000000000: // x = 22.0 - return 10'000'000'000'000'000'000'000.0; - case 0x4037000000000000: // x = 23.0 - return 0x1.52d02c7e14af6p76 + x; - } - } - - // Use double-double - DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); - - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); - - if (LIBC_LIKELY(upper_dd == lower_dd)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); - return r; - } - - // Use 128-bit precision - Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} +LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return math::exp10(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10f.cpp b/libc/src/math/generic/exp10f.cpp index 5284c380f52ec..b2d4f097bc7ce 100644 --- a/libc/src/math/generic/exp10f.cpp +++ b/libc/src/math/generic/exp10f.cpp @@ -7,12 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10f.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/math/generic/exp10f_impl.h" + +#include "src/__support/math/exp10f.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return generic::exp10f(x); } +LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return math::exp10f(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/explogxf.cpp b/libc/src/math/generic/explogxf.cpp deleted file mode 100644 index d38efa0269693..0000000000000 --- a/libc/src/math/generic/explogxf.cpp +++ /dev/null @@ -1,75 +0,0 @@ -//===-- Single-precision general exp/log functions ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "explogxf.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -// N[Table[Log[2, 1 + x], {x, 0/64, 63/64, 1/64}], 40] -alignas(8) const double LOG_P1_LOG2[LOG_P1_SIZE] = { - 0x0.0000000000000p+0, 0x1.6e79685c2d22ap-6, 0x1.6bad3758efd87p-5, - 0x1.0eb389fa29f9bp-4, 0x1.663f6fac91316p-4, 0x1.bc84240adabbap-4, - 0x1.08c588cda79e4p-3, 0x1.32ae9e278ae1ap-3, 0x1.5c01a39fbd688p-3, - 0x1.84c2bd02f03b3p-3, 0x1.acf5e2db4ec94p-3, 0x1.d49ee4c325970p-3, - 0x1.fbc16b902680ap-3, 0x1.11307dad30b76p-2, 0x1.24407ab0e073ap-2, - 0x1.37124cea4cdedp-2, 0x1.49a784bcd1b8bp-2, 0x1.5c01a39fbd688p-2, - 0x1.6e221cd9d0cdep-2, 0x1.800a563161c54p-2, 0x1.91bba891f1709p-2, - 0x1.a33760a7f6051p-2, 0x1.b47ebf73882a1p-2, 0x1.c592fad295b56p-2, - 0x1.d6753e032ea0fp-2, 0x1.e726aa1e754d2p-2, 0x1.f7a8568cb06cfp-2, - 0x1.03fda8b97997fp-1, 0x1.0c10500d63aa6p-1, 0x1.140c9faa1e544p-1, - 0x1.1bf311e95d00ep-1, 0x1.23c41d42727c8p-1, 0x1.2b803473f7ad1p-1, - 0x1.3327c6ab49ca7p-1, 0x1.3abb3faa02167p-1, 0x1.423b07e986aa9p-1, - 0x1.49a784bcd1b8bp-1, 0x1.510118708a8f9p-1, 0x1.5848226989d34p-1, - 0x1.5f7cff41e09afp-1, 0x1.66a008e4788ccp-1, 0x1.6db196a76194ap-1, - 0x1.74b1fd64e0754p-1, 0x1.7ba18f93502e4p-1, 0x1.82809d5be7073p-1, - 0x1.894f74b06ef8bp-1, 0x1.900e6160002cdp-1, 0x1.96bdad2acb5f6p-1, - 0x1.9d5d9fd5010b3p-1, 0x1.a3ee7f38e181fp-1, 0x1.aa708f58014d3p-1, - 0x1.b0e4126bcc86cp-1, 0x1.b74948f5532dap-1, 0x1.bda071cc67e6ep-1, - 0x1.c3e9ca2e1a055p-1, 0x1.ca258dca93316p-1, 0x1.d053f6d260896p-1, - 0x1.d6753e032ea0fp-1, 0x1.dc899ab3ff56cp-1, 0x1.e29142e0e0140p-1, - 0x1.e88c6b3626a73p-1, 0x1.ee7b471b3a950p-1, 0x1.f45e08bcf0655p-1, - 0x1.fa34e1177c233p-1, -}; - -// N[Table[1/(1 + x), {x, 0/64, 63/64, 1/64}], 40] -alignas(8) const double LOG_P1_1_OVER[LOG_P1_SIZE] = { - 0x1.0000000000000p+0, 0x1.f81f81f81f820p-1, 0x1.f07c1f07c1f08p-1, - 0x1.e9131abf0b767p-1, 0x1.e1e1e1e1e1e1ep-1, 0x1.dae6076b981dbp-1, - 0x1.d41d41d41d41dp-1, 0x1.cd85689039b0bp-1, 0x1.c71c71c71c71cp-1, - 0x1.c0e070381c0e0p-1, 0x1.bacf914c1bad0p-1, 0x1.b4e81b4e81b4fp-1, - 0x1.af286bca1af28p-1, 0x1.a98ef606a63bep-1, 0x1.a41a41a41a41ap-1, - 0x1.9ec8e951033d9p-1, 0x1.999999999999ap-1, 0x1.948b0fcd6e9e0p-1, - 0x1.8f9c18f9c18fap-1, 0x1.8acb90f6bf3aap-1, 0x1.8618618618618p-1, - 0x1.8181818181818p-1, 0x1.7d05f417d05f4p-1, 0x1.78a4c8178a4c8p-1, - 0x1.745d1745d1746p-1, 0x1.702e05c0b8170p-1, 0x1.6c16c16c16c17p-1, - 0x1.6816816816817p-1, 0x1.642c8590b2164p-1, 0x1.6058160581606p-1, - 0x1.5c9882b931057p-1, 0x1.58ed2308158edp-1, 0x1.5555555555555p-1, - 0x1.51d07eae2f815p-1, 0x1.4e5e0a72f0539p-1, 0x1.4afd6a052bf5bp-1, - 0x1.47ae147ae147bp-1, 0x1.446f86562d9fbp-1, 0x1.4141414141414p-1, - 0x1.3e22cbce4a902p-1, 0x1.3b13b13b13b14p-1, 0x1.3813813813814p-1, - 0x1.3521cfb2b78c1p-1, 0x1.323e34a2b10bfp-1, 0x1.2f684bda12f68p-1, - 0x1.2c9fb4d812ca0p-1, 0x1.29e4129e4129ep-1, 0x1.27350b8812735p-1, - 0x1.2492492492492p-1, 0x1.21fb78121fb78p-1, 0x1.1f7047dc11f70p-1, - 0x1.1cf06ada2811dp-1, 0x1.1a7b9611a7b96p-1, 0x1.1811811811812p-1, - 0x1.15b1e5f75270dp-1, 0x1.135c81135c811p-1, 0x1.1111111111111p-1, - 0x1.0ecf56be69c90p-1, 0x1.0c9714fbcda3bp-1, 0x1.0a6810a6810a7p-1, - 0x1.0842108421084p-1, 0x1.0624dd2f1a9fcp-1, 0x1.0410410410410p-1, - 0x1.0204081020408p-1}; - -// Taylos series expansion for Log[2, 1 + x] splitted to EVEN AND ODD numbers -// K_LOG2_ODD starts from x^3 -alignas(8) const - double K_LOG2_ODD[4] = 
{0x1.ec709dc3a03fdp-2, 0x1.2776c50ef9bfep-2, - 0x1.a61762a7aded9p-3, 0x1.484b13d7c02a9p-3}; - -alignas(8) const - double K_LOG2_EVEN[4] = {-0x1.71547652b82fep-1, -0x1.71547652b82fep-2, - -0x1.ec709dc3a03fdp-3, -0x1.2776c50ef9bfep-3}; - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/explogxf.h b/libc/src/math/generic/explogxf.h index 212ede4758549..be4328a4f48b5 100644 --- a/libc/src/math/generic/explogxf.h +++ b/libc/src/math/generic/explogxf.h @@ -10,167 +10,17 @@ #define LLVM_LIBC_SRC_MATH_GENERIC_EXPLOGXF_H #include "common_constants.h" -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/nearest_integer.h" + #include "src/__support/common.h" -#include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/exp10f_utils.h" +#include "src/__support/math/exp_utils.h" namespace LIBC_NAMESPACE_DECL { -struct ExpBase { - // Base = e - static constexpr int MID_BITS = 5; - static constexpr int MID_MASK = (1 << MID_BITS) - 1; - // log2(e) * 2^5 - static constexpr double LOG2_B = 0x1.71547652b82fep+0 * (1 << MID_BITS); - // High and low parts of -log(2) * 2^(-5) - static constexpr double M_LOGB_2_HI = -0x1.62e42fefa0000p-1 / (1 << MID_BITS); - static constexpr double M_LOGB_2_LO = - -0x1.cf79abc9e3b3ap-40 / (1 << MID_BITS); - // Look up table for bit fields of 2^(i/32) for i = 0..31, generated by Sollya - // with: - // > for i from 0 to 31 do printdouble(round(2^(i/32), D, RN)); - static constexpr int64_t EXP_2_MID[1 << MID_BITS] = { - 0x3ff0000000000000, 0x3ff059b0d3158574, 0x3ff0b5586cf9890f, - 0x3ff11301d0125b51, 0x3ff172b83c7d517b, 0x3ff1d4873168b9aa, - 0x3ff2387a6e756238, 0x3ff29e9df51fdee1, 0x3ff306fe0a31b715, - 0x3ff371a7373aa9cb, 0x3ff3dea64c123422, 0x3ff44e086061892d, - 0x3ff4bfdad5362a27, 0x3ff5342b569d4f82, 0x3ff5ab07dd485429, - 0x3ff6247eb03a5585, 0x3ff6a09e667f3bcd, 0x3ff71f75e8ec5f74, - 0x3ff7a11473eb0187, 0x3ff82589994cce13, 0x3ff8ace5422aa0db, - 0x3ff93737b0cdc5e5, 0x3ff9c49182a3f090, 0x3ffa5503b23e255d, - 0x3ffae89f995ad3ad, 0x3ffb7f76f2fb5e47, 0x3ffc199bdd85529c, - 0x3ffcb720dcef9069, 0x3ffd5818dcfba487, 0x3ffdfc97337b9b5f, - 0x3ffea4afa2a490da, 0x3fff50765b6e4540, - }; - - // Approximating e^dx with degree-5 minimax polynomial generated by Sollya: - // > Q = fpminimax(expm1(x)/x, 4, [|1, D...|], [-log(2)/64, log(2)/64]); - // Then: - // e^dx ~ P(dx) = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[3] * dx^5. - static constexpr double COEFFS[4] = { - 0x1.ffffffffe5bc8p-2, 0x1.555555555cd67p-3, 0x1.5555c2a9b48b4p-5, - 0x1.11112a0e34bdbp-7}; - - LIBC_INLINE static double powb_lo(double dx) { - using fputil::multiply_add; - double dx2 = dx * dx; - double c0 = 1.0 + dx; - // c1 = COEFFS[0] + COEFFS[1] * dx - double c1 = multiply_add(dx, ExpBase::COEFFS[1], ExpBase::COEFFS[0]); - // c2 = COEFFS[2] + COEFFS[3] * dx - double c2 = multiply_add(dx, ExpBase::COEFFS[3], ExpBase::COEFFS[2]); - // r = c4 + c5 * dx^4 - // = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[5] * dx^7 - return fputil::polyeval(dx2, c0, c1, c2); - } -}; - -struct Exp10Base : public ExpBase { - // log2(10) * 2^5 - static constexpr double LOG2_B = 0x1.a934f0979a371p1 * (1 << MID_BITS); - // High and low parts of -log10(2) * 2^(-5). 
-  // Notice that since |x * log2(10)| < 150:
-  //   |k| = |round(x * log2(10) * 2^5)| < 2^8 * 2^5 = 2^13
-  // So when the FMA instructions are not available, in order for the product
-  //   k * M_LOGB_2_HI
-  // to be exact, we only store the high part of log10(2) up to 38 bits
-  // (= 53 - 15) of precision.
-  // It is generated by Sollya with:
-  // > round(log10(2), 44, RN);
-  static constexpr double M_LOGB_2_HI = -0x1.34413509f8p-2 / (1 << MID_BITS);
-  // > round(log10(2) - 0x1.34413509f8p-2, D, RN);
-  static constexpr double M_LOGB_2_LO = 0x1.80433b83b532ap-44 / (1 << MID_BITS);
-
-  // Approximating 10^dx with degree-5 minimax polynomial generated by Sollya:
-  // > Q = fpminimax((10^x - 1)/x, 4, [|D...|], [-log10(2)/2^6, log10(2)/2^6]);
-  // Then:
-  //   10^dx ~ P(dx) = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5.
-  static constexpr double COEFFS[5] = {0x1.26bb1bbb55515p1, 0x1.53524c73bd3eap1,
-                                       0x1.0470591dff149p1, 0x1.2bd7c0a9fbc4dp0,
-                                       0x1.1429e74a98f43p-1};
-
-  static double powb_lo(double dx) {
-    using fputil::multiply_add;
-    double dx2 = dx * dx;
-    // c0 = 1 + COEFFS[0] * dx
-    double c0 = multiply_add(dx, Exp10Base::COEFFS[0], 1.0);
-    // c1 = COEFFS[1] + COEFFS[2] * dx
-    double c1 = multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]);
-    // c2 = COEFFS[3] + COEFFS[4] * dx
-    double c2 = multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]);
-    // r = c0 + dx^2 * (c1 + c2 * dx^2)
-    //   = c0 + c1 * dx^2 + c2 * dx^4
-    //   = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5.
-    return fputil::polyeval(dx2, c0, c1, c2);
-  }
-};
-
 constexpr int LOG_P1_BITS = 6;
 constexpr int LOG_P1_SIZE = 1 << LOG_P1_BITS;
 
-// N[Table[Log[2, 1 + x], {x, 0/64, 63/64, 1/64}], 40]
-extern const double LOG_P1_LOG2[LOG_P1_SIZE];
-
-// N[Table[1/(1 + x), {x, 0/64, 63/64, 1/64}], 40]
-extern const double LOG_P1_1_OVER[LOG_P1_SIZE];
-
-// Taylor series expansion for Log[2, 1 + x] split into even and odd powers;
-// K_LOG2_ODD starts from x^3
-extern const double K_LOG2_ODD[4];
-extern const double K_LOG2_EVEN[4];
-
-// Output of range reduction for exp_b: (2^(mid + hi), lo)
-// where:
-//   b^x = 2^(mid + hi) * b^lo
-struct exp_b_reduc_t {
-  double mh; // 2^(mid + hi)
-  double lo;
-};
-
-// The function correctly calculates the value of b^x with at least float
-// precision in a limited range.
-// Range reduction:
-//   b^x = 2^(hi + mid) * b^lo
-// where:
-//   x = (hi + mid) * log_b(2) + lo
-//   hi is an integer,
-//   mid * 2^MID_BITS is an integer with 0 <= mid * 2^MID_BITS < 2^MID_BITS,
-//   -2^(-MID_BITS - 1) <= lo * log2(b) <= 2^(-MID_BITS - 1)
-// Base class needs to provide the following constants:
-//   - MID_BITS    : number of bits after the decimal point used for mid
-//   - MID_MASK    : 2^MID_BITS - 1, mask to extract mid bits
-//   - LOG2_B      : log2(b) * 2^MID_BITS for scaling
-//   - M_LOGB_2_HI : high part of -log_b(2) * 2^(-MID_BITS)
-//   - M_LOGB_2_LO : low part of -log_b(2) * 2^(-MID_BITS)
-//   - EXP_2_MID   : look up table for bit fields of 2^mid
-// Return:
-//   { 2^(hi + mid), lo }
-template <class Base> LIBC_INLINE exp_b_reduc_t exp_b_range_reduc(float x) {
-  double xd = static_cast<double>(x);
-  // kd = round(x * log2(b) * 2^MID_BITS)
-  double kd = fputil::nearest_integer(Base::LOG2_B * xd);
-  // k = int(kd)
-  int k = static_cast<int>(kd);
-  // hi = floor(kd * 2^(-MID_BITS))
-  // exp_hi = shift hi to the exponent field of double precision.
-  uint64_t exp_hi = static_cast<uint64_t>(k >> Base::MID_BITS)
-                    << fputil::FPBits<double>::FRACTION_LEN;
-  // mh = 2^hi * 2^mid
-  // mh_bits = bit field of mh
-  uint64_t mh_bits = Base::EXP_2_MID[k & Base::MID_MASK] + exp_hi;
-  double mh = fputil::FPBits<double>(mh_bits).get_val();
-  // dx = lo = x - (hi + mid) * log_b(2)
-  double dx = fputil::multiply_add(
-      kd, Base::M_LOGB_2_LO, fputil::multiply_add(kd, Base::M_LOGB_2_HI, xd));
-  return {mh, dx};
-}
-
 // The function correctly calculates sinh(x) and cosh(x) by calculating exp(x)
 // and exp(-x) simultaneously.
 // To compute e^x, we perform the following range
@@ -270,33 +120,6 @@ template <bool is_sinh> LIBC_INLINE double exp_pm_eval(float x) {
   return r;
 }
 
-// x should be positive, normal finite value
-LIBC_INLINE static double log2_eval(double x) {
-  using FPB = fputil::FPBits<double>;
-  FPB bs(x);
-
-  double result = 0;
-  result += bs.get_exponent();
-
-  int p1 = (bs.get_mantissa() >> (FPB::FRACTION_LEN - LOG_P1_BITS)) &
-           (LOG_P1_SIZE - 1);
-
-  bs.set_uintval(bs.uintval() & (FPB::FRACTION_MASK >> LOG_P1_BITS));
-  bs.set_biased_exponent(FPB::EXP_BIAS);
-  double dx = (bs.get_val() - 1.0) * LOG_P1_1_OVER[p1];
-
-  // Taylor series for log(2,1+x)
-  double c1 = fputil::multiply_add(dx, K_LOG2_ODD[0], K_LOG2_EVEN[0]);
-  double c2 = fputil::multiply_add(dx, K_LOG2_ODD[1], K_LOG2_EVEN[1]);
-  double c3 = fputil::multiply_add(dx, K_LOG2_ODD[2], K_LOG2_EVEN[2]);
-  double c4 = fputil::multiply_add(dx, K_LOG2_ODD[3], K_LOG2_EVEN[3]);
-
-  // c0 = dx * (1.0 / ln(2)) + LOG_P1_LOG2[p1]
-  double c0 = fputil::multiply_add(dx, 0x1.71547652b82fep+0, LOG_P1_LOG2[p1]);
-  result += LIBC_NAMESPACE::fputil::polyeval(dx * dx, c0, c1, c2, c3, c4);
-  return result;
-}
-
 // x should be positive, normal finite value
 // TODO: Simplify range reduction and polynomial degree for float16.
 // See issue #137190.
@@ -375,58 +198,6 @@ LIBC_INLINE static double log_eval(double x) {
   return result;
 }
 
-// Rounding tests for 2^hi * (mid + lo) when the output might be denormal. We
-// assume further that 1 <= mid < 2, mid + lo < 2, and |lo| << mid.
-// Notice that, if 0 < x < 2^-1022,
-//   double(2^-1022 + x) - 2^-1022 = double(x).
-// So if we scale x up by 2^1022, we can use
-//   double(1.0 + 2^1022 * x) - 1.0 to test how x is rounded in denormal range.
-template <bool SKIP_ZIV_TEST = false>
-LIBC_INLINE static cpp::optional<double>
-ziv_test_denorm(int hi, double mid, double lo, double err) {
-  using FPBits = typename fputil::FPBits<double>;
-
-  // Scaling factor = 1/(min normal number) = 2^1022
-  int64_t exp_hi = static_cast<int64_t>(hi + 1022) << FPBits::FRACTION_LEN;
-  double mid_hi = cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(mid));
-  double lo_scaled =
-      (lo != 0.0) ? cpp::bit_cast<double>(exp_hi + cpp::bit_cast<int64_t>(lo))
-                  : 0.0;
-
-  double extra_factor = 0.0;
-  uint64_t scale_down = 0x3FE0'0000'0000'0000; // 1022 in the exponent field.
-
-  // Result is denormal if (mid_hi + lo_scaled < 1.0).
-  if ((1.0 - mid_hi) > lo_scaled) {
-    // Extra rounding step is needed, which adds more rounding errors.
-    err += 0x1.0p-52;
-    extra_factor = 1.0;
-    scale_down = 0x3FF0'0000'0000'0000; // 1023 in the exponent field.
-  }
-
-  // By adding 1.0, the results will have similar rounding points to denormal
-  // outputs.
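// Editorial sketch: the plain Ziv rounding test that these kernels apply
// (ziv_test_denorm above adds the 2^1022 scaling so the same idea works for
// denormal outputs). If hi + (lo + err) and hi + (lo - err) round to the same
// double, the true value cannot straddle a rounding boundary, so that double
// is the correctly rounded result. ziv_accept is an illustrative name.
#include <cstdio>

bool ziv_accept(double hi, double lo, double err, double &out) {
  double upper = hi + (lo + err);
  double lower = hi + (lo - err);
  out = upper;
  return upper == lower; // equal bounds => correctly rounded
}

int main() {
  double r;
  std::printf("accepted: %d\n", ziv_accept(1.0, 0x1.0p-55, 0x1.0p-60, r));
  return 0;
}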
- if constexpr (SKIP_ZIV_TEST) { - double r = extra_factor + (mid_hi + lo_scaled); - return cpp::bit_cast(cpp::bit_cast(r) - scale_down); - } else { - double err_scaled = - cpp::bit_cast(exp_hi + cpp::bit_cast(err)); - - double lo_u = lo_scaled + err_scaled; - double lo_l = lo_scaled - err_scaled; - - double upper = extra_factor + (mid_hi + lo_u); - double lower = extra_factor + (mid_hi + lo_l); - - if (LIBC_LIKELY(upper == lower)) { - return cpp::bit_cast(cpp::bit_cast(upper) - scale_down); - } - - return cpp::nullopt; - } -} - } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPLOGXF_H diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index dfdfd5d6d5760..a45ef511c9bad 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -9,20 +9,17 @@ #include "src/math/powf.h" #include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. #include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/except_value_utils.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/FPUtil/sqrt.h" // Speedup for powf(x, 1/2) = sqrtf(x) #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/exp10f.h" // Speedup for powf(10, y) = exp10f(y) -#include "exp10f_impl.h" // Speedup for powf(10, y) = exp10f(y) #include "exp2f_impl.h" // Speedup for powf(2, y) = exp2f(y) namespace LIBC_NAMESPACE_DECL { @@ -781,7 +778,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { return generic::exp2f(y); case 0x4120'0000: // x = 10.0f // pow(10, y) = exp10(y) - return generic::exp10f(y); + return math::exp10f(y); #endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS } diff --git a/libc/src/math/generic/sinhf.cpp b/libc/src/math/generic/sinhf.cpp index d6158fd302536..63111f84de141 100644 --- a/libc/src/math/generic/sinhf.cpp +++ b/libc/src/math/generic/sinhf.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/math/sinhf.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/macros/config.h" diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt index a967247db53f4..8ab1c9ff98d2f 100644 --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -7,7 +7,9 @@ add_header_library( aarch64/inline_memcpy.h aarch64/inline_memmove.h aarch64/inline_memset.h + arm/common.h arm/inline_memcpy.h + arm/inline_memset.h generic/aligned_access.h generic/byte_per_byte.h inline_bcmp.h diff --git a/libc/src/string/memory_utils/arm/common.h b/libc/src/string/memory_utils/arm/common.h new file mode 100644 index 0000000000000..b9f40b64fed98 --- /dev/null +++ b/libc/src/string/memory_utils/arm/common.h @@ -0,0 +1,55 @@ +//===-- Common constants and defines for arm --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H + +#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR +#include "src/string/memory_utils/utils.h" // CPtr, Ptr, distance_to_align + +#include <stddef.h> // size_t + +// Our minimum supported compiler version does not recognize the standard +// [[likely]] / [[unlikely]] attributes so we use the preprocessor. + +// https://libc.llvm.org/compiler_support.html +// Support for [[likely]] / [[unlikely]] +// [X] GCC 12.2 +// [X] Clang 12 +// [ ] Clang 11 +#define LIBC_ATTR_LIKELY [[likely]] +#define LIBC_ATTR_UNLIKELY [[unlikely]] + +#if defined(LIBC_COMPILER_IS_CLANG) +#if LIBC_COMPILER_CLANG_VER < 1200 +#undef LIBC_ATTR_LIKELY +#undef LIBC_ATTR_UNLIKELY +#define LIBC_ATTR_LIKELY +#define LIBC_ATTR_UNLIKELY +#endif +#endif + +namespace LIBC_NAMESPACE_DECL { + +LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t); + +enum class AssumeAccess { kUnknown, kAligned }; +enum class BlockOp { kFull, kByWord }; + +LIBC_INLINE auto misaligned(CPtr ptr) { + return distance_to_align_down<kWordSize>(ptr); +} + +LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) { + return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) | + cpp::bit_cast<uintptr_t>(b)); +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H diff --git a/libc/src/string/memory_utils/arm/inline_memcpy.h b/libc/src/string/memory_utils/arm/inline_memcpy.h index 61efebe29b485..c748048a3e586 100644 --- a/libc/src/string/memory_utils/arm/inline_memcpy.h +++ b/libc/src/string/memory_utils/arm/inline_memcpy.h @@ -5,63 +5,57 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// The functions defined in this file give approximate code size. These sizes +// assume the following configuration options: +// - LIBC_CONF_KEEP_FRAME_POINTER = false +// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false +// - LIBC_ADD_NULL_CHECKS = false #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H +#include "src/__support/CPP/type_traits.h" // always_false #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL +#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align #include <stddef.h> // size_t -// https://libc.llvm.org/compiler_support.html -// Support for [[likely]] / [[unlikely]] -// [X] GCC 12.2 -// [X] Clang 12 -// [ ] Clang 11 -#define LIBC_ATTR_LIKELY [[likely]] -#define LIBC_ATTR_UNLIKELY [[unlikely]] - -#if defined(LIBC_COMPILER_IS_CLANG) -#if LIBC_COMPILER_CLANG_VER < 1200 -#undef LIBC_ATTR_LIKELY -#undef LIBC_ATTR_UNLIKELY -#define LIBC_ATTR_LIKELY -#define LIBC_ATTR_UNLIKELY -#endif -#endif - namespace LIBC_NAMESPACE_DECL { namespace { -LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t); - -enum Strategy { - ForceWordLdStChain, - AssumeWordAligned, - AssumeUnaligned, -}; +// Performs a copy of `bytes` bytes from `src` to `dst`. This function has the +// semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is +// free to use whatever instruction is best for the size and assumed access.
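+// +// Editor's sketch (hypothetical codegen, not part of the original patch): with +// alignment assumed, `copy<4, AssumeAccess::kAligned>(dst, src)` may lower to a +// single aligned word load/store pair, e.g. +// ldr r3, [r1] +// str r3, [r0] +// while `copy<4, AssumeAccess::kUnknown>(dst, src)` must remain correct for any +// alignment and can only use instructions that tolerate unaligned addresses.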
+template <size_t bytes, AssumeAccess access> +LIBC_INLINE void copy(void *dst, const void *src) { + if constexpr (access == AssumeAccess::kAligned) { + constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes; + memcpy_inline<bytes>(assume_aligned<alignment>(dst), + assume_aligned<alignment>(src)); + } else if constexpr (access == AssumeAccess::kUnknown) { + memcpy_inline<bytes>(dst, src); + } else { + static_assert(cpp::always_false<decltype(access)>, "Invalid AssumeAccess"); + } +} -template <size_t bytes, Strategy strategy = AssumeUnaligned> -LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) { - if constexpr (strategy == AssumeUnaligned) { - memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src)); - } else if constexpr (strategy == AssumeWordAligned) { - static_assert(bytes >= kWordSize); - memcpy_inline<bytes>(assume_aligned<kWordSize>(dst), - assume_aligned<kWordSize>(src)); - } else if constexpr (strategy == ForceWordLdStChain) { +template <size_t bytes, BlockOp block_op = BlockOp::kFull, + AssumeAccess access = AssumeAccess::kUnknown> +LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) { + if constexpr (block_op == BlockOp::kFull) { + copy<bytes, access>(dst, src); + } else if constexpr (block_op == BlockOp::kByWord) { // We restrict loads/stores to 4 bytes to prevent the use of load/store - // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may - // fault (see notes below) and second, they use more registers which in turn - // adds push/pop instructions in the hot path. + // multiple (LDM, STM) and load/store double (LDRD, STRD). static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize)); LIBC_LOOP_UNROLL - for (size_t i = 0; i < bytes / kWordSize; ++i) { - const size_t offset = i * kWordSize; - memcpy_inline<kWordSize>(dst + offset, src + offset); + for (size_t offset = 0; offset < bytes; offset += kWordSize) { + copy<kWordSize, access>(dst + offset, src + offset); } + } else { + static_assert(cpp::always_false<decltype(block_op)>, "Invalid BlockOp"); } // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting // into the load/store instructions. @@ -72,39 +66,27 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) { src += bytes; } -LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, - const size_t size) { +template <size_t bytes, BlockOp block_op, AssumeAccess access> +LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) { LIBC_LOOP_NOUNROLL - for (size_t i = 0; i < size; ++i) - *dst++ = *src++; + for (size_t i = 0; i < size / bytes; ++i) + copy_block_and_bump_pointers<bytes, block_op, access>(dst, src); + size %= bytes; } -template <size_t block_size, Strategy strategy> -LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src, - size_t &size) { +[[maybe_unused]] LIBC_INLINE void +copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { LIBC_LOOP_NOUNROLL - for (size_t i = 0; i < size / block_size; ++i) - copy_and_bump_pointers<block_size, strategy>(dst, src); - // Update `size` once at the end instead of once per iteration. - size %= block_size; -} - -LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) { - return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) | - cpp::bit_cast<uintptr_t>(b)); -} - -LIBC_INLINE auto misaligned(CPtr a) { - return distance_to_align_down<kWordSize>(a); + for (size_t i = 0; i < size; ++i) + *dst++ = *src++; } } // namespace -// Implementation for Cortex-M0, M0+, M1. -// Notes: -// - It compiles down to 196 bytes, but 220 bytes when used through `memcpy` -// that also needs to return the `dst` ptr. -// - These cores do not allow for unaligned loads/stores. +// Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned +// loads/stores. It compiles down to 208 bytes when used through `memcpy` that +// also needs to return the `dst` ptr. +// Note: // - When `src` and `dst` are coaligned, we start by aligning them and perform + bulk copies.
We let the compiler know the pointers are aligned so it can // use load/store multiple (LDM, STM). This significantly increases throughput @@ -121,13 +103,20 @@ LIBC_INLINE auto misaligned(CPtr a) { copy_bytes_and_bump_pointers(dst, src, offset); size -= offset; } + constexpr AssumeAccess kAligned = AssumeAccess::kAligned; const auto src_alignment = distance_to_align_down<kWordSize>(src); if (src_alignment == 0) LIBC_ATTR_LIKELY { // Both `src` and `dst` are now word-aligned. - copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size); - copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size); - copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size); + // We first copy by blocks of 64 bytes; the compiler will use 4 + // load/store multiple (LDM, STM), each of 4 words. This requires more + // registers so additional push/pop are needed but the speedup is worth + // it. + consume_by_block<64, BlockOp::kFull, kAligned>(dst, src, size); + // Then we use blocks of 4 word load/store. + consume_by_block<16, BlockOp::kByWord, kAligned>(dst, src, size); + // Then we use word by word copy. + consume_by_block<4, BlockOp::kByWord, kAligned>(dst, src, size); } else { // `dst` is aligned but `src` is not. @@ -138,7 +127,7 @@ LIBC_INLINE auto misaligned(CPtr a) { src_alignment == 2 ? load_aligned<uint32_t, uint16_t, uint16_t>(src) : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src); - memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value); + copy<kWordSize, kAligned>(dst, &value); dst += kWordSize; src += kWordSize; size -= kWordSize; @@ -151,17 +140,8 @@ LIBC_INLINE auto misaligned(CPtr a) { } // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware -// support for unaligned loads and stores. -// Notes: -// - It compiles down to 266 bytes. -// - `dst` and `src` are not `__restrict` to prevent the compiler from -// reordering loads/stores. -// - We keep state variables to a strict minimum to keep everything in the free -// registers and prevent costly push / pop. -// - If unaligned single loads/stores to normal memory are supported, unaligned -// accesses for load/store multiple (LDM, STM) and load/store double (LDRD, -// STRD) instructions are generally not supported and will still fault so we -// make sure to restrict unrolling to word loads/stores. +// support for unaligned loads and stores. It compiles down to 272 bytes when +// used through `memcpy` that also needs to return the `dst` ptr.
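+// Editor's note (illustration, not part of the original patch): below, sizes +// under 8 are decomposed bitwise rather than looped over, so e.g. size == 7 +// copies 1 + 2 + 4 bytes with three straight-line tests and no loop: +// if (size & 1) copy_block_and_bump_pointers<1>(dst, src); +// if (size & 2) copy_block_and_bump_pointers<2>(dst, src); +// if (size & 4) copy_block_and_bump_pointers<4>(dst, src);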
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src, size_t size) { if (misaligned(bitwise_or(src, dst))) @@ -169,38 +149,59 @@ LIBC_INLINE auto misaligned(CPtr a) { if (size < 8) LIBC_ATTR_UNLIKELY { if (size & 1) - copy_and_bump_pointers<1>(dst, src); + copy_block_and_bump_pointers<1>(dst, src); if (size & 2) - copy_and_bump_pointers<2>(dst, src); + copy_block_and_bump_pointers<2>(dst, src); if (size & 4) - copy_and_bump_pointers<4>(dst, src); + copy_block_and_bump_pointers<4>(dst, src); return; } if (misaligned(src)) LIBC_ATTR_UNLIKELY { const size_t offset = distance_to_align_up<kWordSize>(dst); if (offset & 1) - copy_and_bump_pointers<1>(dst, src); + copy_block_and_bump_pointers<1>(dst, src); if (offset & 2) - copy_and_bump_pointers<2>(dst, src); + copy_block_and_bump_pointers<2>(dst, src); size -= offset; } } - copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size); - copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size); - copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size); + // `dst` and `src` are not necessarily both aligned at that point but this + // implementation assumes hardware support for unaligned loads and stores so + // it is still fast to perform unrolled word by word copy. Note that wider + // accesses through the use of load/store multiple (LDM, STM) and load/store + // double (LDRD, STRD) instructions are generally not supported and can fault. + // By forcing decomposition of 64-byte copies into word by word copy, the + // compiler uses a load to prefetch the next cache line: + // ldr r3, [r1, #64]! <- prefetch next cache line + // str r3, [r0] + // ldr r3, [r1, #0x4] + // str r3, [r0, #0x4] + // ... + // ldr r3, [r1, #0x3c] + // str r3, [r0, #0x3c] + // This is a bit detrimental for sizes between 64 and 256 (less than 10% + // penalty) but the prefetch yields better throughput for larger copies. + constexpr AssumeAccess kUnknown = AssumeAccess::kUnknown; + consume_by_block<64, BlockOp::kByWord, kUnknown>(dst, src, size); + consume_by_block<16, BlockOp::kByWord, kUnknown>(dst, src, size); + consume_by_block<4, BlockOp::kByWord, kUnknown>(dst, src, size); if (size & 1) - copy_and_bump_pointers<1>(dst, src); + copy_block_and_bump_pointers<1>(dst, src); if (size & 2) - LIBC_ATTR_UNLIKELY - copy_and_bump_pointers<2>(dst, src); + copy_block_and_bump_pointers<2>(dst, src); } -[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_, - const void *__restrict src_, +[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src, size_t size) { - Ptr dst = cpp::bit_cast<Ptr>(dst_); - CPtr src = cpp::bit_cast<CPtr>(src_); + // The compiler performs alias analysis and is able to prove that `dst` and + // `src` do not alias by propagating the `__restrict` keyword from the + // `memcpy` prototype. This allows the compiler to merge consecutive + // load/store (LDR, STR) instructions generated in + // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store + // double (LDRD, STRD) instructions; this is undesirable so we prevent the + // compiler from inferring `__restrict` with the following line.
+ asm volatile("" : "+r"(dst), "+r"(src)); #ifdef __ARM_FEATURE_UNALIGNED return inline_memcpy_arm_mid_end(dst, src, size); #else @@ -210,8 +211,4 @@ LIBC_INLINE auto misaligned(CPtr a) { } // namespace LIBC_NAMESPACE_DECL -// Cleanup local macros -#undef LIBC_ATTR_LIKELY -#undef LIBC_ATTR_UNLIKELY - #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H diff --git a/libc/src/string/memory_utils/arm/inline_memset.h b/libc/src/string/memory_utils/arm/inline_memset.h new file mode 100644 index 0000000000000..a7ef9cc7df916 --- /dev/null +++ b/libc/src/string/memory_utils/arm/inline_memset.h @@ -0,0 +1,156 @@ +//===-- Memset implementation for arm ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// The functions defined in this file give approximate code size. These sizes +// assume the following configuration options: +// - LIBC_CONF_KEEP_FRAME_POINTER = false +// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false +// - LIBC_ADD_NULL_CHECKS = false +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H + +#include "src/__support/CPP/type_traits.h" // always_false +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL +#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY +#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align + +#include <stddef.h> // size_t + +namespace LIBC_NAMESPACE_DECL { + +namespace { + +template <size_t bytes, AssumeAccess access> +LIBC_INLINE void set(void *dst, uint32_t value) { + static_assert(bytes == 1 || bytes == 2 || bytes == 4); + if constexpr (access == AssumeAccess::kAligned) { + constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes; + memcpy_inline<bytes>(assume_aligned<alignment>(dst), &value); + } else if constexpr (access == AssumeAccess::kUnknown) { + memcpy_inline<bytes>(dst, &value); + } else { + static_assert(cpp::always_false<decltype(access)>, "Invalid AssumeAccess"); + } +} + +template <size_t bytes, AssumeAccess access = AssumeAccess::kUnknown> +LIBC_INLINE void set_block_and_bump_pointers(Ptr &dst, uint32_t value) { + if constexpr (bytes <= kWordSize) { + set<bytes, access>(dst, value); + } else { + static_assert(bytes % kWordSize == 0 && bytes >= kWordSize); + LIBC_LOOP_UNROLL + for (size_t offset = 0; offset < bytes; offset += kWordSize) { + set<kWordSize, access>(dst + offset, value); + } + } + // In the 1, 2, 4 byte set case, the compiler can fold pointer offsetting + // into the store instructions. + // e.g., + // strb r3, [r0], #1 + dst += bytes; +} + +template <size_t bytes, AssumeAccess access> +LIBC_INLINE void consume_by_block(Ptr &dst, uint32_t value, size_t &size) { + LIBC_LOOP_NOUNROLL + for (size_t i = 0; i < size / bytes; ++i) + set_block_and_bump_pointers<bytes, access>(dst, value); + size %= bytes; +} + +[[maybe_unused]] LIBC_INLINE void +set_bytes_and_bump_pointers(Ptr &dst, uint32_t value, size_t size) { + LIBC_LOOP_NOUNROLL + for (size_t i = 0; i < size; ++i) { + set<1, AssumeAccess::kUnknown>(dst++, value); + } +} + +} // namespace + +// Implementation for Cortex-M0, M0+, M1. It compiles down to 140 bytes when +// used through `memset` that also needs to return the `dst` ptr. These cores do +// not allow unaligned stores so all accesses are aligned.
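+// Editor's note (worked example, not part of the original patch): the +// word-wide stores below rely on splatting the byte into all four lanes of a +// 32-bit word: +// const uint32_t value32 = value * 0x01010101U; // 0x5C -> 0x5C5C5C5C +// so a single 32-bit STR writes four copies of `value` at once.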
+[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_low_end(Ptr dst, uint8_t value, size_t size) { + if (size >= 8) + LIBC_ATTR_LIKELY { + // Align `dst` to word boundary. + if (const size_t offset = distance_to_align_up<kWordSize>(dst)) + LIBC_ATTR_UNLIKELY { + set_bytes_and_bump_pointers(dst, value, offset); + size -= offset; + } + const uint32_t value32 = value * 0x01010101U; // splat value in each byte + consume_by_block<64, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<16, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<4, AssumeAccess::kAligned>(dst, value32, size); + } + set_bytes_and_bump_pointers(dst, value, size); +} + +// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware +// support for unaligned loads and stores. It compiles down to 186 bytes when +// used through `memset` that also needs to return the `dst` ptr. +[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_mid_end(Ptr dst, uint8_t value, size_t size) { + const uint32_t value32 = value * 0x01010101U; // splat value in each byte + if (misaligned(dst)) + LIBC_ATTR_UNLIKELY { + if (size < 8) + LIBC_ATTR_UNLIKELY { + if (size & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (size & 2) + set_block_and_bump_pointers<2>(dst, value32); + if (size & 4) + set_block_and_bump_pointers<4>(dst, value32); + return; + } + const size_t offset = distance_to_align_up<kWordSize>(dst); + if (offset & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (offset & 2) + set_block_and_bump_pointers<2>(dst, value32); + size -= offset; + } + // If we tell the compiler that the stores are aligned, it will generate 8 x + // STRD instructions. By not specifying alignment, the compiler conservatively + // uses 16 x STR.W and is able to use the first one to prefetch the + // destination in advance, leading to better asymptotic performance. + // str r12, [r3, #64]! <- prefetch next cache line + // str.w r12, [r3, #0x4] + // str.w r12, [r3, #0x8] + // ... + // str.w r12, [r3, #0x38] + // str.w r12, [r3, #0x3c] + consume_by_block<64, AssumeAccess::kUnknown>(dst, value32, size); + // Prefetching does not matter anymore at this scale so using STRD yields + better results.
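+ // Editor's sketch (assumed codegen, not part of the original patch): with + // alignment known, each 16-byte block below may lower to a pair of STRD + // stores, e.g. + // strd r12, r12, [r3] + // strd r12, r12, [r3, #0x8]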
+ consume_by_block<16, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<4, AssumeAccess::kAligned>(dst, value32, size); + if (size & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (size & 2) + LIBC_ATTR_UNLIKELY + set_block_and_bump_pointers<2>(dst, value32); +} + +[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_dispatch(Ptr dst, uint8_t value, size_t size) { +#ifdef __ARM_FEATURE_UNALIGNED + return inline_memset_arm_mid_end(dst, value, size); +#else + return inline_memset_arm_low_end(dst, value, size); +#endif +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H diff --git a/libc/src/string/memory_utils/inline_memset.h b/libc/src/string/memory_utils/inline_memset.h index fd9c29ea4410a..e41bdb626d60e 100644 --- a/libc/src/string/memory_utils/inline_memset.h +++ b/libc/src/string/memory_utils/inline_memset.h @@ -18,6 +18,9 @@ #if defined(LIBC_TARGET_ARCH_IS_X86) #include "src/string/memory_utils/x86_64/inline_memset.h" #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_x86 +#elif defined(LIBC_TARGET_ARCH_IS_ARM) +#include "src/string/memory_utils/arm/inline_memset.h" +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_arm_dispatch +#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) #include "src/string/memory_utils/aarch64/inline_memset.h" #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_aarch64_dispatch @@ -34,7 +37,8 @@ namespace LIBC_NAMESPACE_DECL { -LIBC_INLINE static void inline_memset(void *dst, uint8_t value, size_t count) { +[[gnu::flatten]] LIBC_INLINE void inline_memset(void *dst, uint8_t value, + size_t count) { LIBC_SRC_STRING_MEMORY_UTILS_MEMSET(reinterpret_cast<Ptr>(dst), value, count); } diff --git a/libc/src/wctype/CMakeLists.txt b/libc/src/wctype/CMakeLists.txt new file mode 100644 index 0000000000000..3ac5eaef8ed8b --- /dev/null +++ b/libc/src/wctype/CMakeLists.txt @@ -0,0 +1,9 @@ +add_entrypoint_object( + iswalpha + SRCS + iswalpha.cpp + HDRS + iswalpha.h + DEPENDS + libc.src.__support.wctype_utils +) diff --git a/libc/src/wctype/iswalpha.cpp b/libc/src/wctype/iswalpha.cpp new file mode 100644 index 0000000000000..e18f29370fbd0 --- /dev/null +++ b/libc/src/wctype/iswalpha.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of iswalpha ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wctype/iswalpha.h" +#include "src/__support/common.h" +#include "src/__support/wctype_utils.h" + +#include "hdr/types/wint_t.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(bool, iswalpha, (wint_t c)) { return internal::iswalpha(c); } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wctype/iswalpha.h b/libc/src/wctype/iswalpha.h new file mode 100644 index 0000000000000..681fc6ba79a54 --- /dev/null +++ b/libc/src/wctype/iswalpha.h @@ -0,0 +1,21 @@ +//===-- Implementation header for iswalpha ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H +#define LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H + +#include "hdr/types/wint_t.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE_DECL { + +bool iswalpha(wint_t c); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index 6dca47b5343e6..b3eba43582074 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -70,6 +70,7 @@ add_subdirectory(stdlib) add_subdirectory(string) add_subdirectory(strings) add_subdirectory(wchar) +add_subdirectory(wctype) add_subdirectory(time) add_subdirectory(unistd) diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt index 5176bfd4b024b..f0727451736f9 100644 --- a/libc/test/src/__support/wchar/CMakeLists.txt +++ b/libc/test/src/__support/wchar/CMakeLists.txt @@ -19,3 +19,18 @@ add_libc_test( DEPENDS libc.src.__support.wchar.character_converter ) + + +add_libc_test( + string_converter_test + SUITE + libc-support-tests + SRCS + string_converter_test.cpp + DEPENDS + libc.src.__support.wchar.string_converter + libc.src.__support.wchar.mbstate + libc.src.__support.error_or + libc.hdr.errno_macros + libc.hdr.types.char32_t +) diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp new file mode 100644 index 0000000000000..d514df9317852 --- /dev/null +++ b/libc/test/src/__support/wchar/string_converter_test.cpp @@ -0,0 +1,423 @@ +//===-- Unittests for StringConverter class -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/errno_macros.h" +#include "hdr/types/char32_t.h" +#include "hdr/types/char8_t.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/properties/os.h" +#include "src/__support/wchar/mbstate.h" +#include "src/__support/wchar/string_converter.h" +#include "test/UnitTest/Test.h" + +// TODO: add support for 16-bit widechars to StringConverter to remove this +// macro +#ifdef LIBC_TARGET_OS_IS_WINDOWS +TEST(LlvmLibcStringConverterTest, Windows) { + // pass on windows for now +} + +#else + +TEST(LlvmLibcStringConverterTest, UTF8To32) { + // first 4 bytes are clown emoji (🤡) + // next 3 bytes are sigma symbol (∑) + // next 2 bytes are y with diaeresis (ÿ) + // last byte is the letter A + const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91\xC3\xBF\x41"; + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), &state, SIZE_MAX); + + auto res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x1f921); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x2211); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 7); + + res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xff); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 9); + + res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x41); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 10); + + res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 11); + + res = sc.popUTF32(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(res.error(), -1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 11); +} + +TEST(LlvmLibcStringConverterTest, UTF32To8) { + // clown emoji, sigma symbol, y with diaeresis, letter A + const wchar_t src[] = {static_cast(0x1f921), + static_cast(0x2211), + static_cast(0xff), static_cast(0x41), + static_cast(0x0)}; + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), &state, SIZE_MAX); + + auto res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xF0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x9F); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA4); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + // end of clown emoji, sigma symbol begins + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xE2); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 2); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x88); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 2); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x91); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 2); + + // end of sigma symbol, y with diaeresis begins + res = sc.popUTF8(); + 
ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xC3); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 3); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xBF); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 3); + + // end of y with diaeresis, letter A begins + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x41); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + // null byte + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 5); + + res = sc.popUTF8(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(res.error(), -1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 5); +} + +TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) { + const wchar_t src[] = { + static_cast(0x1f921), static_cast(0x2211), + static_cast(0x0)}; // clown emoji, sigma symbol + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), &state, SIZE_MAX, 1); + + auto res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xF0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x9F); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA4); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + // can only read 1 character from source string, so error on next pop + res = sc.popUTF8(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(res.error(), -1); +} + +TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) { + // first 4 bytes are clown emoji, then next 3 are sigma symbol + const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91"; + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), &state, SIZE_MAX, 5); + + auto res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x1f921); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + res = sc.popUTF32(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(res.error()), -1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 5); +} + +TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) { + const wchar_t src[] = { + static_cast(0x1f921), static_cast(0xffffff), + static_cast(0x0)}; // clown emoji, invalid utf32 + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), &state, SIZE_MAX); + + auto res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xF0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x9F); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA4); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(res.error()), EILSEQ); + 
ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); +} + +TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) { + // first 4 bytes are clown emoji (🤡) + // next 3 form an invalid character + const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30"; + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), &state, SIZE_MAX); + + auto res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x1f921); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + res = sc.popUTF32(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(res.error()), EILSEQ); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); +} + +TEST(LlvmLibcStringConverterTest, InvalidCharacterOutsideBounds) { + // if an invalid character exists in the source string but we don't have space + // to write it, we should return a "stop converting" error rather than an + // invalid character error + + // first 4 bytes are clown emoji (🤡) + // next 3 form an invalid character + const char *src1 = "\xF0\x9F\xA4\xA1\x90\x88\x30"; + LIBC_NAMESPACE::internal::mbstate ps1; + LIBC_NAMESPACE::internal::StringConverter sc1( + reinterpret_cast(src1), &ps1, 1); + + auto res1 = sc1.popUTF32(); + ASSERT_TRUE(res1.has_value()); + ASSERT_EQ(static_cast(res1.value()), 0x1f921); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 4); + + res1 = sc1.popUTF32(); + ASSERT_FALSE(res1.has_value()); + // no space to write error NOT invalid character error (EILSEQ) + ASSERT_EQ(static_cast(res1.error()), -1); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 4); + + const wchar_t src2[] = { + static_cast(0x1f921), static_cast(0xffffff), + static_cast(0x0)}; // clown emoji, invalid utf32 + LIBC_NAMESPACE::internal::mbstate ps2; + LIBC_NAMESPACE::internal::StringConverter sc2( + reinterpret_cast(src2), &ps2, 4); + + auto res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xF0); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0x9F); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xA4); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xA1); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_FALSE(res2.has_value()); + // no space to write error NOT invalid character error (EILSEQ) + ASSERT_EQ(static_cast(res2.error()), -1); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); +} + +TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) { + /* + We do NOT test partially popping a character and expecting the next + StringConverter to continue where we left off. This is not expected to work + and considered invalid. 
+ */ + const wchar_t src[] = { + static_cast(0x1f921), static_cast(0xff), + static_cast(0x0)}; // clown emoji, y with diaeresis (ÿ) + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc1( + reinterpret_cast(src), &state, SIZE_MAX, 1); + + auto res = sc1.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xF0); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); + + res = sc1.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x9F); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); + + res = sc1.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA4); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); + + res = sc1.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA1); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); + + // sc2 should pick up where sc1 left off and continue the conversion + LIBC_NAMESPACE::internal::StringConverter sc2( + reinterpret_cast(src) + sc1.getSourceIndex(), &state, + SIZE_MAX, 1); + + res = sc2.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xC3); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res = sc2.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xBF); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); +} + +TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) { + const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc1( + reinterpret_cast(src), &state, SIZE_MAX, 2); + + auto res = sc1.popUTF32(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(res.error()), -1); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 2); + + // sc2 should pick up where sc1 left off and continue the conversion + LIBC_NAMESPACE::internal::StringConverter sc2( + reinterpret_cast(src) + sc1.getSourceIndex(), &state, + SIZE_MAX, 3); + + res = sc2.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x1f921); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 2); + + res = sc2.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 3); +} + +TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) { + const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), &state, 1); + + auto res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + res = sc.popUTF32(); // no space to pop this into + ASSERT_FALSE(res.has_value()); +} + +TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) { + const wchar_t src[] = {static_cast(0x1f921), + static_cast(0x1f921)}; // 2 clown emojis + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), &state, 5); + + auto res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_FALSE(res.has_value()); + 
ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); +} + +#endif diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp index cb88bfcade0dc..5c30fb7c8718f 100644 --- a/libc/test/src/math/cospif_test.cpp +++ b/libc/test/src/math/cospif_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcCospifTest, SmallValues) { LIBC_NAMESPACE::cospif(x), 0.5); } -// SDCOMP-26094: check sinfpi in the cases for which the range reducer +// SDCOMP-26094: check cospif in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcCospifTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/math/explogxf_test.cpp b/libc/test/src/math/explogxf_test.cpp index 01197b835433f..ff1181e0c6fd0 100644 --- a/libc/test/src/math/explogxf_test.cpp +++ b/libc/test/src/math/explogxf_test.cpp @@ -43,11 +43,6 @@ TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) { def_prec); } -TEST_F(LlvmLibcExplogfTest, Log2InFloatRange) { - CHECK_DATA(0.0f, inf, mpfr::Operation::Log2, LIBC_NAMESPACE::log2_eval, - f_normal, def_count, def_prec); -} - TEST_F(LlvmLibcExplogfTest, LogInFloatRange) { CHECK_DATA(0.0f, inf, mpfr::Operation::Log, LIBC_NAMESPACE::log_eval, f_normal, def_count, def_prec); diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index ad2155f329cd9..4aac1fabfbd62 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -164,7 +164,7 @@ TEST_F(LlvmLibcSinCosfTest, SpecialValues) { } } -// SDCOMP-26094: check sinf in the cases for which the range reducer +// SDCOMP-26094: check sincosf in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcSinCosfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp index 986c676761f0e..94e3dbc4f07d4 100644 --- a/libc/test/src/math/sinpif_test.cpp +++ b/libc/test/src/math/sinpif_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcSinpifTest, SmallValues) { LIBC_NAMESPACE::sinpif(x), 0.5); } -// SDCOMP-26094: check sinfpi in the cases for which the range reducer +// SDCOMP-26094: check sinpif in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcSinpifTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/wctype/CMakeLists.txt b/libc/test/src/wctype/CMakeLists.txt new file mode 100644 index 0000000000000..5459cdb4a9b71 --- /dev/null +++ b/libc/test/src/wctype/CMakeLists.txt @@ -0,0 +1,11 @@ +add_custom_target(libc_wctype_unittests) + +add_libc_test( + iswalpha_test + SUITE + libc_wctype_unittests + SRCS + iswalpha_test.cpp + DEPENDS + libc.src.wctype.iswalpha +) diff --git a/libc/test/src/wctype/iswalpha_test.cpp b/libc/test/src/wctype/iswalpha_test.cpp new file mode 100644 index 0000000000000..f3f75f4dc7aa5 --- /dev/null +++ b/libc/test/src/wctype/iswalpha_test.cpp @@ -0,0 +1,54 @@ +//===-- Unittests for iswalpha --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/span.h" +#include "src/wctype/iswalpha.h" + +#include "test/UnitTest/LibcTest.h" +#include "test/UnitTest/Test.h" + +namespace { + +// TODO: Merge the wctype tests using this framework. +constexpr char WALPHA_ARRAY[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', + 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', +}; + +bool in_span(int ch, LIBC_NAMESPACE::cpp::span arr) { + for (size_t i = 0; i < arr.size(); ++i) + if (static_cast(arr[i]) == ch) + return true; + return false; +} + +} // namespace + +TEST(LlvmLibciswalpha, SimpleTest) { + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha('a')); + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha('B')); + + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('3')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(' ')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('?')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('\0')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(-1)); +} + +TEST(LlvmLibciswalpha, DefaultLocale) { + // Loops through all characters, verifying that letters return + // true and everything else returns false. + for (int ch = -255; ch < 255; ++ch) { + if (in_span(ch, WALPHA_ARRAY)) + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha(ch)); + else + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(ch)); + } +} diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index e2871d1b01a16..e4e9a74639b17 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -164,34 +164,14 @@ endif() list( SORT LIBCLC_TARGETS_TO_BUILD ) -# Construct LLVM version define -set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}" ) - # This needs to be set before any target that needs it # We need to use LLVM_INCLUDE_DIRS here, because if we are linking to an # llvm build directory, this includes $src/llvm/include which is where all the # headers are not $build/include/ which is what LLVM_INCLUDE_DIR is set to. 
include_directories( ${LLVM_INCLUDE_DIRS} ) -# Setup prepare_builtins tools -set(LLVM_LINK_COMPONENTS - BitReader - BitWriter - Core - IRReader - Support -) -if( LIBCLC_STANDALONE_BUILD ) - add_llvm_executable( prepare_builtins utils/prepare-builtins.cpp ) - set( prepare_builtins_exe prepare_builtins ) - set( prepare_builtins_target prepare_builtins ) -else() - add_llvm_utility( prepare_builtins utils/prepare-builtins.cpp ) - setup_host_tool( prepare_builtins PREPARE_BUILTINS prepare_builtins_exe prepare_builtins_target ) -endif() -target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} ) -# These were not properly reported in early LLVM and we don't need them -target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions ) +# Configure prepare_builtins +add_subdirectory( utils ) # Setup arch devices set( r600--_devices cedar cypress barts cayman ) @@ -425,6 +405,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) -I${CMAKE_CURRENT_SOURCE_DIR}/clc/include # Error on undefined macros -Werror=undef + -fdiscard-value-names ) if( NOT "${cpu}" STREQUAL "" ) diff --git a/libclc/clc/include/clc/atomic/atomic_decl.inc b/libclc/clc/include/clc/atomic/atomic_decl.inc new file mode 100644 index 0000000000000..b790a94c7d288 --- /dev/null +++ b/libclc/clc/include/clc/atomic/atomic_decl.inc @@ -0,0 +1,47 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// MemoryOrder is memory order supported by Clang __scoped_atomic* builtins. +// MemoryScope is memory scope supported by Clang __scoped_atomic* builtins. 
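+// +// Editor's sketch (illustration, not part of the original patch): for +// __CLC_GENTYPE == int and FUNCTION == __clc_atomic_fetch_add, the default +// branch below expands to a declaration such as +// _CLC_OVERLOAD _CLC_DECL int __clc_atomic_fetch_add( +// volatile global int *Ptr, int Value, int MemoryOrder, int MemoryScope); +// whose definition may forward directly to the Clang builtin +// __scoped_atomic_fetch_add(Ptr, Value, MemoryOrder, MemoryScope);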
+ +#ifdef __CLC_SCALAR +#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) + +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ + int MemoryScope); +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void FUNCTION(volatile ADDRSPACE __CLC_GENTYPE *Ptr, \ + __CLC_GENTYPE Value, int MemoryOrder, \ + int MemoryScope); +#elif defined(__CLC_COMPARE_EXCHANGE) +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \ + __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \ + int MemoryScope); +#else +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ + int MemoryOrder, int MemoryScope); +#endif + +__CLC_DECLARE_ATOMIC(global) +__CLC_DECLARE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DECLARE_ATOMIC() +#endif + +#undef __CLC_DECLARE_ATOMIC + +#endif // defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) +#endif // __CLC_SCALAR diff --git a/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h b/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h new file mode 100644 index 0000000000000..ae7918ac32e43 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ + +#include + +#define FUNCTION __clc_atomic_compare_exchange +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_COMPARE_EXCHANGE +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_dec.h b/libclc/clc/include/clc/atomic/clc_atomic_dec.h new file mode 100644 index 0000000000000..ada36ba3ff9b3 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_dec.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ + +#include + +#define FUNCTION __clc_atomic_dec +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_exchange.h b/libclc/clc/include/clc/atomic/clc_atomic_exchange.h new file mode 100644 index 0000000000000..7e626d4a8830b --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_exchange.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ + +#include + +#define FUNCTION __clc_atomic_exchange + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h new file mode 100644 index 0000000000000..ad0c2eb4607a7 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h new file mode 100644 index 0000000000000..80810c38cbbb8 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_and + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h new file mode 100644 index 0000000000000..56f511922e5c7 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h new file mode 100644 index 0000000000000..f17408d28a35d --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h new file mode 100644 index 0000000000000..b82069e6f960e --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_or + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h new file mode 100644 index 0000000000000..6cfd224629d60 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h new file mode 100644 index 0000000000000..b007b47a9369d --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_xor + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_inc.h b/libclc/clc/include/clc/atomic/clc_atomic_inc.h new file mode 100644 index 0000000000000..3ddef4a8bf355 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_inc.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_INC_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_INC_H__ + +#include + +#define FUNCTION __clc_atomic_inc +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_INC_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_load.h b/libclc/clc/include/clc/atomic/clc_atomic_load.h new file mode 100644 index 0000000000000..a4899b34b88a1 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_load.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ + +#include + +#define FUNCTION __clc_atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_store.h b/libclc/clc/include/clc/atomic/clc_atomic_store.h new file mode 100644 index 0000000000000..6baf0eb7ea32b --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_store.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ + +#include + +#define FUNCTION __clc_atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_RETURN_VOID +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ diff --git a/libclc/clc/include/clc/integer/clc_bit_reverse.h b/libclc/clc/include/clc/integer/clc_bit_reverse.h new file mode 100644 index 0000000000000..c945e326c74fa --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bit_reverse.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BIT_REVERSE_H__ +#define __CLC_INTEGER_CLC_BIT_REVERSE_H__ + +#define FUNCTION __clc_bit_reverse +#define __CLC_BODY + +#include + +#undef FUNCTION + +#endif // __CLC_INTEGER_CLC_BIT_REVERSE_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc b/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc new file mode 100644 index 0000000000000..c93eff08de0bc --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc @@ -0,0 +1,10 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE base, uint offset,
+                                           uint count);
diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h b/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h
new file mode 100644
index 0000000000000..9c2e047b8be00
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__
+#define __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__
+
+#include
+
+#define FUNCTION __clc_bitfield_extract_signed
+#define __RETTYPE __CLC_S_GENTYPE
+
+#define __CLC_BODY
+#include
+
+#undef __RETTYPE
+#undef FUNCTION
+
+#endif // __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__
diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h b/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h
new file mode 100644
index 0000000000000..95305a3027e5d
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_INTEGER_CLC_BITFIELD_EXTRACT_UNSIGNED_H__
+#define __CLC_INTEGER_CLC_BITFIELD_EXTRACT_UNSIGNED_H__
+
+#include
+
+#define FUNCTION __clc_bitfield_extract_unsigned
+#define __RETTYPE __CLC_U_GENTYPE
+
+#define __CLC_BODY
+#include
+
+#undef __RETTYPE
+#undef FUNCTION
+
+#endif // __CLC_INTEGER_CLC_BITFIELD_EXTRACT_UNSIGNED_H__
diff --git a/libclc/clc/include/clc/integer/clc_bitfield_insert.h b/libclc/clc/include/clc/integer/clc_bitfield_insert.h
new file mode 100644
index 0000000000000..f4d36b2ad2d2e
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_bitfield_insert.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_INTEGER_CLC_BITFIELD_INSERT_H__
+#define __CLC_INTEGER_CLC_BITFIELD_INSERT_H__
+
+#include
+
+#define FUNCTION __clc_bitfield_insert
+#define __CLC_BODY
+#include
+
+#undef FUNCTION
+
+#endif // __CLC_INTEGER_CLC_BITFIELD_INSERT_H__
diff --git a/libclc/clc/include/clc/integer/clc_bitfield_insert.inc b/libclc/clc/include/clc/integer/clc_bitfield_insert.inc
new file mode 100644
index 0000000000000..22f58bdc09830
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_bitfield_insert.inc
@@ -0,0 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE base,
+                                               __CLC_GENTYPE insert,
+                                               uint offset, uint count);
diff --git a/libclc/clc/include/clc/relational/binary_decl.inc b/libclc/clc/include/clc/relational/binary_decl.inc
index bcdf5238b8f58..dc8ec9db7b7da 100644
--- a/libclc/clc/include/clc/relational/binary_decl.inc
+++ b/libclc/clc/include/clc/relational/binary_decl.inc
@@ -6,4 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-_CLC_OVERLOAD _CLC_DECL __CLC_INTN FUNCTION(__CLC_FLOATN a, __CLC_FLOATN b);
+#if __CLC_VECSIZE_OR_1 == 1
+#define __RETTYPE __CLC_INTN
+#else
+#define __RETTYPE __CLC_BIT_INTN
+#endif
+
+_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b);
+
+#undef __RETTYPE
diff --git a/libclc/clc/include/clc/relational/clc_isfinite.h b/libclc/clc/include/clc/relational/clc_isfinite.h
index 5e71ec7a0640a..444d733039819 100644
--- a/libclc/clc/include/clc/relational/clc_isfinite.h
+++ b/libclc/clc/include/clc/relational/clc_isfinite.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_isfinite
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_isgreater.h b/libclc/clc/include/clc/relational/clc_isgreater.h
index e2e6911a80cdd..88de46854961d 100644
--- a/libclc/clc/include/clc/relational/clc_isgreater.h
+++ b/libclc/clc/include/clc/relational/clc_isgreater.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_isgreater
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_isgreaterequal.h b/libclc/clc/include/clc/relational/clc_isgreaterequal.h
index 3fe8835aff9d5..42308036f102f 100644
--- a/libclc/clc/include/clc/relational/clc_isgreaterequal.h
+++ b/libclc/clc/include/clc/relational/clc_isgreaterequal.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_isgreaterequal
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_isless.h b/libclc/clc/include/clc/relational/clc_isless.h
index 01384cf6fa4a0..6fdc6c54947c0 100644
--- a/libclc/clc/include/clc/relational/clc_isless.h
+++ b/libclc/clc/include/clc/relational/clc_isless.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_isless
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_islessequal.h b/libclc/clc/include/clc/relational/clc_islessequal.h
index a4b77a451b248..e592287b23099 100644
--- a/libclc/clc/include/clc/relational/clc_islessequal.h
+++ b/libclc/clc/include/clc/relational/clc_islessequal.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_islessequal
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_islessgreater.h b/libclc/clc/include/clc/relational/clc_islessgreater.h
index 9fb6d641bfa14..a2f10707a677d 100644
--- a/libclc/clc/include/clc/relational/clc_islessgreater.h
+++ b/libclc/clc/include/clc/relational/clc_islessgreater.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_islessgreater
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_isnormal.h b/libclc/clc/include/clc/relational/clc_isnormal.h
index d580fed5a7395..2281bc4245d03 100644
--- a/libclc/clc/include/clc/relational/clc_isnormal.h
+++ b/libclc/clc/include/clc/relational/clc_isnormal.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_isnormal
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_isnotequal.h b/libclc/clc/include/clc/relational/clc_isnotequal.h
index 16982fc3c5aaa..c2640fc0899a6 100644
--- a/libclc/clc/include/clc/relational/clc_isnotequal.h
+++ b/libclc/clc/include/clc/relational/clc_isnotequal.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_isnotequal
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_isordered.h b/libclc/clc/include/clc/relational/clc_isordered.h
index 7ba26662105fc..cb9be31311575 100644
--- a/libclc/clc/include/clc/relational/clc_isordered.h
+++ b/libclc/clc/include/clc/relational/clc_isordered.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_isordered
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_isunordered.h b/libclc/clc/include/clc/relational/clc_isunordered.h
index eac158d245191..36d314ff0e1be 100644
--- a/libclc/clc/include/clc/relational/clc_isunordered.h
+++ b/libclc/clc/include/clc/relational/clc_isunordered.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_isunordered
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/clc_signbit.h b/libclc/clc/include/clc/relational/clc_signbit.h
index 892263a09e99c..9e423ab448953 100644
--- a/libclc/clc/include/clc/relational/clc_signbit.h
+++ b/libclc/clc/include/clc/relational/clc_signbit.h
@@ -12,7 +12,7 @@
 #define FUNCTION __clc_signbit
 #define __CLC_BODY
 
-#include
+#include
 
 #undef FUNCTION
 
diff --git a/libclc/clc/include/clc/relational/floatn.inc b/libclc/clc/include/clc/relational/floatn.inc
deleted file mode 100644
index 263937f6eef6f..0000000000000
--- a/libclc/clc/include/clc/relational/floatn.inc
+++ /dev/null
@@ -1,132 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
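// A sketch of the overloads the reworked binary_decl.inc above (and the
// matching unary_decl.inc change below) is intended to declare once expanded
// through gentype.inc. The concrete signatures are illustrative, following
// the OpenCL C rule that scalar relational functions return int while vector
// forms return the signed integer vector whose element width matches the
// operand:
//
//   _CLC_OVERLOAD _CLC_DECL int isgreater(float a, float b);
//   _CLC_OVERLOAD _CLC_DECL int4 isgreater(float4 a, float4 b);
//   _CLC_OVERLOAD _CLC_DECL long4 isgreater(double4 a, double4 b);
//   _CLC_OVERLOAD _CLC_DECL short4 isgreater(half4 a, half4 b);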
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include - -#define __CLC_FLOATN float -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float2 -#define __CLC_INTN int2 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float3 -#define __CLC_INTN int3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float4 -#define __CLC_INTN int4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float8 -#define __CLC_INTN int8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float16 -#define __CLC_INTN int16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#undef __CLC_FLOAT -#undef __CLC_INT - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -#define __CLC_FLOATN double -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double2 -#define __CLC_INTN long2 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double3 -#define __CLC_INTN long3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double4 -#define __CLC_INTN long4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double8 -#define __CLC_INTN long8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double16 -#define __CLC_INTN long16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#endif -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define __CLC_FLOATN half -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half2 -#define __CLC_INTN short2 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half3 -#define __CLC_INTN short3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half4 -#define __CLC_INTN short4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half8 -#define __CLC_INTN short8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half16 -#define __CLC_INTN short16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#endif - -#undef __CLC_BODY diff --git a/libclc/clc/include/clc/relational/unary_decl.inc b/libclc/clc/include/clc/relational/unary_decl.inc index b9fb36c905469..cc3f2d065529b 100644 --- a/libclc/clc/include/clc/relational/unary_decl.inc +++ b/libclc/clc/include/clc/relational/unary_decl.inc @@ -6,4 +6,12 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_INTN FUNCTION(__CLC_FLOATN x); +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE x); + +#undef __RETTYPE diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index bf8736a726315..ee4f771799e8e 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,4 +1,17 @@ async/clc_prefetch.cl +atomic/clc_atomic_compare_exchange.cl +atomic/clc_atomic_dec.cl +atomic/clc_atomic_exchange.cl +atomic/clc_atomic_fetch_add.cl +atomic/clc_atomic_fetch_and.cl +atomic/clc_atomic_fetch_max.cl 
+atomic/clc_atomic_fetch_min.cl +atomic/clc_atomic_fetch_or.cl +atomic/clc_atomic_fetch_sub.cl +atomic/clc_atomic_fetch_xor.cl +atomic/clc_atomic_inc.cl +atomic/clc_atomic_load.cl +atomic/clc_atomic_store.cl common/clc_degrees.cl common/clc_radians.cl common/clc_sign.cl @@ -15,6 +28,10 @@ geometric/clc_normalize.cl integer/clc_abs.cl integer/clc_abs_diff.cl integer/clc_add_sat.cl +integer/clc_bitfield_extract_signed.cl +integer/clc_bitfield_extract_unsigned.cl +integer/clc_bitfield_insert.cl +integer/clc_bit_reverse.cl integer/clc_clz.cl integer/clc_ctz.cl integer/clc_hadd.cl diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl new file mode 100644 index 0000000000000..796dedcef3857 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc new file mode 100644 index 0000000000000..32ff9b45b769e --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+#if defined(__SPIR32__) || defined(CLC_NVPTX)
+#if (defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32) ||                           \
+    (defined(__CLC_GENSIZE) && (__CLC_GENSIZE == 32))
+#define __CLC_HAS_ATOMIC
+#endif
+#else // defined(__SPIR32__) || defined(CLC_NVPTX)
+#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32)
+#define __CLC_HAS_ATOMIC
+#endif
+#endif // defined(__SPIR32__) || defined(CLC_NVPTX)
+
+#ifdef __CLC_HAS_ATOMIC
+
+#ifdef __CLC_FPSIZE
+
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atomic_compare_exchange(          \
+      volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator,         \
+      __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal,       \
+      int MemoryScope) {                                                       \
+    __CLC_U_GENTYPE Comp = __CLC_AS_U_GENTYPE(Comparator);                     \
+    __scoped_atomic_compare_exchange_n(                                        \
+        (ADDRSPACE __CLC_U_GENTYPE *)Ptr, &Comp, __CLC_AS_U_GENTYPE(Value),    \
+        false, MemoryOrderEqual, MemoryOrderUnequal, MemoryScope);             \
+    return __CLC_AS_GENTYPE(Comp);                                             \
+  }
+
+#else
+
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atomic_compare_exchange(          \
+      volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator,         \
+      __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal,       \
+      int MemoryScope) {                                                       \
+    __scoped_atomic_compare_exchange_n(Ptr, &Comparator, Value, false,         \
+                                       MemoryOrderEqual, MemoryOrderUnequal,   \
+                                       MemoryScope);                           \
+    return Comparator;                                                         \
+  }
+
+#endif // __CLC_FPSIZE
+
+__CLC_DEFINE_ATOMIC(global)
+__CLC_DEFINE_ATOMIC(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DEFINE_ATOMIC()
+#endif
+
+#undef __CLC_DEFINE_ATOMIC
+
+#endif // __CLC_HAS_ATOMIC
+#undef __CLC_HAS_ATOMIC
+
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl b/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl
new file mode 100644
index 0000000000000..f35a9624fd013
--- /dev/null
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#define FUNCTION __clc_atomic_dec
+#define __IMPL_FUNCTION __scoped_atomic_fetch_sub
+#define __CLC_INC_DEC
+
+#define __CLC_BODY
+#include
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc
new file mode 100644
index 0000000000000..2c45f49f60848
--- /dev/null
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc
@@ -0,0 +1,79 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
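// How the floating-point branch of clc_atomic_compare_exchange.inc above is
// expected to behave for a concrete type. The helper below is an illustrative
// sketch (not part of the patch) of the bitcast round-trip through the
// same-width unsigned type; the compare happens on bit patterns, not values:
//
//   float cmpxchg_float_sketch(volatile global float *p, float cmp, float val,
//                              int order_eq, int order_neq, int scope) {
//     uint c = as_uint(cmp);
//     __scoped_atomic_compare_exchange_n((global uint *)p, &c, as_uint(val),
//                                        false, order_eq, order_neq, scope);
//     return as_float(c); // previous value, reinterpreted back to float
//   }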
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+#if defined(__SPIR32__) || defined(CLC_NVPTX)
+#if (defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32) ||                           \
+    (defined(__CLC_GENSIZE) && (__CLC_GENSIZE == 32))
+#define __CLC_HAS_ATOMIC
+#endif
+#else // defined(__SPIR32__) || defined(CLC_NVPTX)
+#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32)
+#define __CLC_HAS_ATOMIC
+#endif
+#endif // defined(__SPIR32__) || defined(CLC_NVPTX)
+
+#ifdef __CLC_HAS_ATOMIC
+
+#ifndef __CLC_PTR_CASTTYPE
+#define __CLC_PTR_CASTTYPE __CLC_GENTYPE
+#endif
+
+#ifndef __CLC_AS_RETTYPE
+#define __CLC_AS_RETTYPE(x) x
+#endif
+
+#ifdef __CLC_NO_VALUE_ARG
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(                               \
+      volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder,                  \
+      int MemoryScope) {                                                       \
+    return __CLC_AS_RETTYPE(__IMPL_FUNCTION(                                   \
+        (ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, MemoryOrder, MemoryScope));       \
+  }
+#elif defined(__CLC_INC_DEC)
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(                               \
+      volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder,                  \
+      int MemoryScope) {                                                       \
+    return __CLC_AS_RETTYPE(                                                   \
+        __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, (__CLC_GENTYPE)1, \
+                        MemoryOrder, MemoryScope));                            \
+  }
+#elif defined(__CLC_RETURN_VOID)
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DEF void FUNCTION(volatile ADDRSPACE __CLC_GENTYPE *Ptr,  \
+                                       __CLC_GENTYPE Value, int MemoryOrder,   \
+                                       int MemoryScope) {                      \
+    __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, MemoryOrder,   \
+                    MemoryScope);                                              \
+  }
+#else
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(                               \
+      volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value,              \
+      int MemoryOrder, int MemoryScope) {                                      \
+    return __CLC_AS_RETTYPE(                                                   \
+        __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value,            \
+                        MemoryOrder, MemoryScope));                            \
+  }
+#endif
+
+__CLC_DEFINE_ATOMIC(global)
+__CLC_DEFINE_ATOMIC(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DEFINE_ATOMIC()
+#endif
+
+#undef __CLC_DEFINE_ATOMIC
+
+#endif // __CLC_HAS_ATOMIC
+#undef __CLC_HAS_ATOMIC
+
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl
new file mode 100644
index 0000000000000..52fd11afed6a2
--- /dev/null
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
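// For reference, one plausible expansion of __CLC_DEFINE_ATOMIC(global) from
// clc_atomic_def.inc above, taking FUNCTION=__clc_atomic_fetch_add with
// __CLC_GENTYPE=int and the default pointer cast type (illustrative, not
// generated output):
//
//   _CLC_OVERLOAD _CLC_DEF int __clc_atomic_fetch_add(
//       volatile global int *Ptr, int Value, int MemoryOrder,
//       int MemoryScope) {
//     return __scoped_atomic_fetch_add((global int *)Ptr, Value, MemoryOrder,
//                                      MemoryScope);
//   }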
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_exchange +#define __IMPL_FUNCTION __scoped_atomic_exchange_n + +#define __CLC_BODY +#include + +#undef __CLC_PTR_CASTTYPE +#undef __CLC_AS_RETTYPE +#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl new file mode 100644 index 0000000000000..0dc44919627b3 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_add +#define __IMPL_FUNCTION __scoped_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl new file mode 100644 index 0000000000000..ec89738bc0f62 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_and +#define __IMPL_FUNCTION __scoped_atomic_fetch_and + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl new file mode 100644 index 0000000000000..0acac711aa96d --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_max +#define __IMPL_FUNCTION __scoped_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl new file mode 100644 index 0000000000000..7a098588ec005 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
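// The second gentype pass in clc_atomic_exchange.cl above redefines
// __CLC_PTR_CASTTYPE/__CLC_AS_RETTYPE so floating-point exchanges go through
// the same-width integer type and the result is bitcast back. Conceptually,
// for float (an illustrative sketch, not patch code):
//
//   float exchange_float_sketch(volatile global float *p, float v,
//                               int order, int scope) {
//     return as_float(__scoped_atomic_exchange_n((global uint *)p, as_uint(v),
//                                                order, scope));
//   }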
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_min +#define __IMPL_FUNCTION __scoped_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl new file mode 100644 index 0000000000000..e0f48fa408350 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_or +#define __IMPL_FUNCTION __scoped_atomic_fetch_or + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl new file mode 100644 index 0000000000000..a4c2c1da1555c --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_sub +#define __IMPL_FUNCTION __scoped_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl new file mode 100644 index 0000000000000..4424a298178fd --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_xor +#define __IMPL_FUNCTION __scoped_atomic_fetch_xor + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl b/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl new file mode 100644 index 0000000000000..019aa8d9d6dd8 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#define FUNCTION __clc_atomic_inc
+#define __IMPL_FUNCTION __scoped_atomic_fetch_add
+#define __CLC_INC_DEC
+
+#define __CLC_BODY
+#include
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl
new file mode 100644
index 0000000000000..1f083073e43ff
--- /dev/null
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#define FUNCTION __clc_atomic_load
+#define __IMPL_FUNCTION __scoped_atomic_load_n
+#define __CLC_NO_VALUE_ARG
+
+#define __CLC_BODY
+#include
+
+#undef __CLC_PTR_CASTTYPE
+#undef __CLC_AS_RETTYPE
+#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN
+#define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x)
+
+#define __CLC_BODY
+#include
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl
new file mode 100644
index 0000000000000..8fd165b9a83b8
--- /dev/null
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#define FUNCTION __clc_atomic_store
+#define __IMPL_FUNCTION __scoped_atomic_store_n
+#define __CLC_RETURN_VOID
+
+#define __CLC_BODY
+#include
+
+#undef __CLC_PTR_CASTTYPE
+#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN
+
+#define __CLC_BODY
+#include
diff --git a/libclc/clc/lib/generic/integer/clc_bit_reverse.cl b/libclc/clc/lib/generic/integer/clc_bit_reverse.cl
new file mode 100644
index 0000000000000..439957383f583
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_bit_reverse.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
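// With __CLC_INC_DEC set, clc_atomic_def.inc supplies the (__CLC_GENTYPE)1
// operand itself, so clc_atomic_inc.cl above should expand to roughly the
// following for int in the global address space (illustrative):
//
//   _CLC_OVERLOAD _CLC_DEF int __clc_atomic_inc(volatile global int *Ptr,
//                                               int MemoryOrder,
//                                               int MemoryScope) {
//     return __scoped_atomic_fetch_add((global int *)Ptr, (int)1, MemoryOrder,
//                                      MemoryScope);
//   }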
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#define FUNCTION __clc_bit_reverse
+#define __IMPL_FUNCTION __builtin_elementwise_bitreverse
+#define __CLC_BODY
+
+#include
diff --git a/libc/include/dlfcn.h.def b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl
similarity index 56%
rename from libc/include/dlfcn.h.def
rename to libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl
index 31395871c6b97..d779ed6a43593 100644
--- a/libc/include/dlfcn.h.def
+++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl
@@ -1,4 +1,4 @@
-//===-- C standard library header dlfcn.h ---------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_DLFCN_H
-#define LLVM_LIBC_DLFCN_H
-
-#include "__llvm-libc-common.h"
-#include "llvm-libc-macros/dlfcn-macros.h"
-
-%%public_api()
+#include
 
-#endif // LLVM_LIBC_DLFCN_H
+#define __CLC_BODY
+#include
diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc
new file mode 100644
index 0000000000000..84cae2166f7ce
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_OVERLOAD _CLC_DEF __CLC_S_GENTYPE
+__clc_bitfield_extract_signed(__CLC_GENTYPE base, uint offset, uint count) {
+  if (count == 0)
+    return 0;
+  __CLC_U_GENTYPE x = __CLC_AS_U_GENTYPE(base)
+                      << (__CLC_GENSIZE - offset - count);
+  // Implement an arithmetic shift right.
+  __CLC_U_GENTYPE s = -(x >> (__CLC_GENSIZE - 1));
+  __CLC_U_GENTYPE result = ((s ^ x) >> (__CLC_GENSIZE - count)) ^ s;
+  return __CLC_AS_S_GENTYPE(result);
+}
diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl
new file mode 100644
index 0000000000000..bf7db401034dc
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl
@@ -0,0 +1,12 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
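// Worked example of the branch-free sign extension in
// clc_bitfield_extract_signed.inc above, on a 32-bit lane (the input values
// are made up for illustration):
//
//   base = 0xFFF00000, offset = 20, count = 12   // field is 0xFFF, negative
//   x = base << (32 - 20 - 12)        = 0xFFF00000
//   s = -(x >> 31)                    = 0xFFFFFFFF  // sign mask of the field
//   ((s ^ x) >> (32 - 12)) ^ s        = 0xFFFFFFFF  // i.e. -1, sign-extended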
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#define __CLC_BODY
+#include
diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc
new file mode 100644
index 0000000000000..bc81ce5c98b09
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE
+__clc_bitfield_extract_unsigned(__CLC_GENTYPE base, uint offset, uint count) {
+  if (count == 0)
+    return 0;
+  __CLC_U_GENTYPE result = __CLC_AS_U_GENTYPE(base)
+                           << (__CLC_GENSIZE - offset - count);
+  return result >> (__CLC_GENSIZE - count);
+}
diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_insert.cl b/libclc/clc/lib/generic/integer/clc_bitfield_insert.cl
new file mode 100644
index 0000000000000..a40fc804f2187
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_bitfield_insert.cl
@@ -0,0 +1,12 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#define __CLC_BODY
+#include
diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc b/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc
new file mode 100644
index 0000000000000..ad8dac28750cc
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_bitfield_insert(__CLC_GENTYPE base,
+                                                           __CLC_GENTYPE insert,
+                                                           uint offset,
+                                                           uint count) {
+  __CLC_U_GENTYPE u_base = __CLC_AS_U_GENTYPE(base);
+  __CLC_U_GENTYPE u_insert = __CLC_AS_U_GENTYPE(insert);
+  __CLC_U_GENTYPE mask = (((__CLC_U_GENTYPE)1 << count) - (__CLC_U_GENTYPE)1)
+                         << offset;
+  mask = count < __CLC_GENSIZE ? mask : ~(__CLC_U_GENTYPE)0;
+  __CLC_U_GENTYPE result = ((u_insert << offset) & mask) | (u_base & ~mask);
+  return __CLC_AS_GENTYPE(result);
+}
diff --git a/libclc/clc/lib/generic/math/clc_native_divide.inc b/libclc/clc/lib/generic/math/clc_native_divide.inc
index fdf1794812c5a..dac176fb986bd 100644
--- a/libclc/clc/lib/generic/math/clc_native_divide.inc
+++ b/libclc/clc/lib/generic/math/clc_native_divide.inc
@@ -8,5 +8,6 @@
 
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_divide(__CLC_GENTYPE x,
                                                          __CLC_GENTYPE y) {
+  _Pragma("clang fp reciprocal(on)");
   return x / y;
 }
diff --git a/libclc/clc/lib/generic/math/clc_native_recip.inc b/libclc/clc/lib/generic/math/clc_native_recip.inc
index 57eb35a9522f8..e7246dc08a77c 100644
--- a/libclc/clc/lib/generic/math/clc_native_recip.inc
+++ b/libclc/clc/lib/generic/math/clc_native_recip.inc
@@ -7,5 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_recip(__CLC_GENTYPE val) {
+  _Pragma("clang fp reciprocal(on)");
   return 1.0f / val;
 }
diff --git a/libclc/clc/lib/generic/math/clc_native_rsqrt.inc b/libclc/clc/lib/generic/math/clc_native_rsqrt.inc
index 7a3b0b2af2721..2b2c4bdada9f9 100644
--- a/libclc/clc/lib/generic/math/clc_native_rsqrt.inc
+++ b/libclc/clc/lib/generic/math/clc_native_rsqrt.inc
@@ -7,5 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_rsqrt(__CLC_GENTYPE val) {
+  _Pragma("clang fp reciprocal(on)");
   return 1.0f / __clc_native_sqrt(val);
 }
diff --git a/libclc/clc/lib/generic/math/clc_native_tan.inc b/libclc/clc/lib/generic/math/clc_native_tan.inc
index f61a78968a754..f0c6c6d37d2b7 100644
--- a/libclc/clc/lib/generic/math/clc_native_tan.inc
+++ b/libclc/clc/lib/generic/math/clc_native_tan.inc
@@ -7,5 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_tan(__CLC_GENTYPE val) {
+  _Pragma("clang fp reciprocal(on)");
   return __clc_native_sin(val) / __clc_native_cos(val);
 }
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_add.h b/libclc/opencl/include/clc/opencl/atomic/atomic_add.h
index 821ae7aab05bf..50fb99d1362fc 100644
--- a/libclc/opencl/include/clc/opencl/atomic/atomic_add.h
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_add.h
@@ -12,6 +12,6 @@
 #include
 
 #define FUNCTION atomic_add
-#include
+#include
 
 #endif // __CLC_OPENCL_ATOMIC_ATOMIC_ADD_H__
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_and.h b/libclc/opencl/include/clc/opencl/atomic/atomic_and.h
index d10cfed9b581a..8ce328c9739aa 100644
--- a/libclc/opencl/include/clc/opencl/atomic/atomic_and.h
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_and.h
@@ -12,6 +12,6 @@
 #include
 
 #define FUNCTION atomic_and
-#include
+#include
 
 #endif // __CLC_OPENCL_ATOMIC_ATOMIC_AND_H__
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h
new file mode 100644
index 0000000000000..76eeda7ba3469
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
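// The _Pragma added to the native_* bodies above enables Clang's
// `fp reciprocal` rewrite for the rest of the enclosing block, licensing a
// division to be compiled with a reciprocal. A minimal standalone
// illustration (not patch code):
//
//   float native_divide_sketch(float x, float y) {
//     _Pragma("clang fp reciprocal(on)")
//     return x / y; // may lower to x * (1.0f / y), trading accuracy for speed
//   }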
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__
+#define __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__
+
+#define FUNCTION atomic_compare_exchange_strong
+#define __CLC_COMPARE_EXCHANGE
+
+#define __CLC_BODY
+#include
+
+#define __CLC_BODY
+#include
+
+#undef __CLC_COMPARE_EXCHANGE
+#undef FUNCTION
+
+#endif // __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h
new file mode 100644
index 0000000000000..12788ad03a2d1
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__
+#define __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__
+
+#define FUNCTION atomic_compare_exchange_weak
+#define __CLC_COMPARE_EXCHANGE
+
+#define __CLC_BODY
+#include
+
+#define __CLC_BODY
+#include
+
+#undef __CLC_COMPARE_EXCHANGE
+#undef FUNCTION
+
+#endif // __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc
index e060e3aaea161..1b2bf17bd6dfd 100644
--- a/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc
@@ -6,17 +6,55 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define __CLC_DECLARE_ATOMIC(ADDRSPACE, TYPE)                                  \
-  _CLC_OVERLOAD _CLC_DECL TYPE FUNCTION(volatile ADDRSPACE TYPE *, TYPE);
+#ifdef __CLC_SCALAR
 
-#define __CLC_DECLARE_ATOMIC_ADDRSPACE(TYPE)                                   \
-  __CLC_DECLARE_ATOMIC(global, TYPE)                                           \
-  __CLC_DECLARE_ATOMIC(local, TYPE)
+#if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) &&         \
+                                 defined(cl_khr_int64_extended_atomics))
+#define HAVE_64_ATOMIC
+#endif
+#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(HAVE_64_ATOMIC))
+#define HAVE_FP_ATOMIC
+#endif
+#if defined(__CLC_GENSIZE) &&                                                  \
+    ((__CLC_GENSIZE == 32) ||                                                  \
+     (__CLC_GENSIZE == 64 && defined(HAVE_64_ATOMIC)))
+#define HAVE_INT_ATOMIC
+#endif
+#if defined(HAVE_FP_ATOMIC) || defined(HAVE_INT_ATOMIC)
 
-__CLC_DECLARE_ATOMIC_ADDRSPACE(int)
-__CLC_DECLARE_ATOMIC_ADDRSPACE(uint)
+#define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE)
 
-#undef __CLC_DECLARE_ATOMIC_ADDRSPACE
-#undef __CLC_DECLARE_ATOMIC
+#ifdef __CLC_NO_VALUE_ARG
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(                              \
+      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr);
+#elif defined(__CLC_RETURN_VOID)
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DECL void FUNCTION(                                       \
+      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value);
+#elif defined(__CLC_COMPARE_EXCHANGE)
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(                              \
+      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr,                            \
+      ADDRSPACE __CLC_GENTYPE *Expected, __CLC_GENTYPE Desired);
+#else
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(                              \
+      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value);
+#endif
 
-#undef FUNCTION
+__CLC_DEFINE_ATOMIC(global)
+__CLC_DEFINE_ATOMIC(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DEFINE_ATOMIC()
+#endif
+
+#undef __CLC_DEFINE_ATOMIC
+
+#endif // HAVE_FP_ATOMIC || HAVE_INT_ATOMIC
+
+#undef HAVE_INT_ATOMIC
+#undef HAVE_FP_ATOMIC
+#undef HAVE_64_ATOMIC
+
+#endif // __CLC_SCALAR
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc
new file mode 100644
index 0000000000000..e060e3aaea161
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define __CLC_DECLARE_ATOMIC(ADDRSPACE, TYPE)                                  \
+  _CLC_OVERLOAD _CLC_DECL TYPE FUNCTION(volatile ADDRSPACE TYPE *, TYPE);
+
+#define __CLC_DECLARE_ATOMIC_ADDRSPACE(TYPE)                                   \
+  __CLC_DECLARE_ATOMIC(global, TYPE)                                           \
+  __CLC_DECLARE_ATOMIC(local, TYPE)
+
+__CLC_DECLARE_ATOMIC_ADDRSPACE(int)
+__CLC_DECLARE_ATOMIC_ADDRSPACE(uint)
+
+#undef __CLC_DECLARE_ATOMIC_ADDRSPACE
+#undef __CLC_DECLARE_ATOMIC
+
+#undef FUNCTION
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h b/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h
new file mode 100644
index 0000000000000..3949bc13401f2
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__
+#define __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__
+
+#define FUNCTION atomic_exchange
+
+#define __CLC_BODY
+#include
+
+#define __CLC_BODY
+#include
+
+#undef FUNCTION
+
+#endif // __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h
new file mode 100644
index 0000000000000..972c1fa69fe7b
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
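// With the reworked atomic_decl.inc above, a supported scalar gentype now
// gets C11-style declarations on the corresponding atomic_* object type,
// e.g. for FUNCTION=atomic_fetch_add with __CLC_GENTYPE=int (an illustrative
// expansion; __CLC_ATOMIC_GENTYPE concatenates to atomic_int):
//
//   _CLC_OVERLOAD _CLC_DECL int atomic_fetch_add(
//       volatile global atomic_int *Ptr, int Value);
//   _CLC_OVERLOAD _CLC_DECL int atomic_fetch_add(
//       volatile local atomic_int *Ptr, int Value);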
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ + +#define FUNCTION atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h new file mode 100644 index 0000000000000..fdac049a74d3f --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ + +#define FUNCTION atomic_fetch_and + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h new file mode 100644 index 0000000000000..513b60fec2727 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ + +#define FUNCTION atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h new file mode 100644 index 0000000000000..c961c4a641656 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ + +#define FUNCTION atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h new file mode 100644 index 0000000000000..25923e3647e36 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ + +#define FUNCTION atomic_fetch_or + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h new file mode 100644 index 0000000000000..b307c30a298b3 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ + +#define FUNCTION atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h new file mode 100644 index 0000000000000..52510d018574d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ + +#define FUNCTION atomic_fetch_xor + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_load.h b/libclc/opencl/include/clc/opencl/atomic/atomic_load.h new file mode 100644 index 0000000000000..3998a4de9452b --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_load.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ + +#define FUNCTION atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_max.h b/libclc/opencl/include/clc/opencl/atomic/atomic_max.h index 667fa36f16f9d..6b95ad7e68d94 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_max.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_max.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_max -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_MAX_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_min.h b/libclc/opencl/include/clc/opencl/atomic/atomic_min.h index 91bb636eec875..c1dfacb40b746 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_min.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_min.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_min -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_MIN_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_or.h b/libclc/opencl/include/clc/opencl/atomic/atomic_or.h index 5c03fd157a2bc..30c32fe4889d5 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_or.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_or.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_or -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_OR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_store.h b/libclc/opencl/include/clc/opencl/atomic/atomic_store.h new file mode 100644 index 0000000000000..4893a5b88df03 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_store.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ + +#define FUNCTION atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_RETURN_VOID +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h b/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h index 25ffe9ff4a9b7..1e7ac5505b071 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_sub -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_SUB_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h b/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h index 6b4206dedb820..043d7825483e4 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h @@ -15,6 +15,6 @@ _CLC_OVERLOAD _CLC_DECL float FUNCTION(volatile local float *, float); _CLC_OVERLOAD _CLC_DECL float FUNCTION(volatile global float *, float); -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_XCHG_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h b/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h index e94560cb6b9ed..a9bee007b9344 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_xor -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_XOR_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bit_reverse.h b/libclc/opencl/include/clc/opencl/integer/bit_reverse.h new file mode 100644 index 0000000000000..46b589557631d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bit_reverse.h @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ +#define __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bit_reverse +#define __CLC_BODY + +#include + +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h new file mode 100644 index 0000000000000..0a902b2a21d6d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bitfield_extract_signed +#define __RETTYPE __CLC_S_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h new file mode 100644 index 0000000000000..28064c08b113e --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bitfield_extract_unsigned +#define __RETTYPE __CLC_U_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h b/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h new file mode 100644 index 0000000000000..e77d7a4f0b957 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
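+//
+// Like the other cl_khr_extended_bit_ops headers, this one defines FUNCTION
+// before pulling in the gentype body. Per the extension's semantics, each
+// gentype gets a declaration of the form (sketch, uint gentype assumed):
+//
+//   _CLC_OVERLOAD _CLC_DECL uint
+//   bitfield_insert(uint base, uint insert, uint offset, uint count);
+//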
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bitfield_insert +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isfinite.h b/libclc/opencl/include/clc/opencl/relational/isfinite.h index 2548e6acf5109..ac3db6764073a 100644 --- a/libclc/opencl/include/clc/opencl/relational/isfinite.h +++ b/libclc/opencl/include/clc/opencl/relational/isfinite.h @@ -14,7 +14,7 @@ #define FUNCTION isfinite #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isgreater.h b/libclc/opencl/include/clc/opencl/relational/isgreater.h index 6dfe6eb810e2a..2230055115bcd 100644 --- a/libclc/opencl/include/clc/opencl/relational/isgreater.h +++ b/libclc/opencl/include/clc/opencl/relational/isgreater.h @@ -14,7 +14,7 @@ #define FUNCTION isgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h b/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h index 1db2c5d58d062..f99a620dabd78 100644 --- a/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h +++ b/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h @@ -14,7 +14,7 @@ #define FUNCTION isgreaterequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isless.h b/libclc/opencl/include/clc/opencl/relational/isless.h index 3e2afb32cddf4..74280e543e0b5 100644 --- a/libclc/opencl/include/clc/opencl/relational/isless.h +++ b/libclc/opencl/include/clc/opencl/relational/isless.h @@ -14,7 +14,7 @@ #define FUNCTION isless #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/islessequal.h b/libclc/opencl/include/clc/opencl/relational/islessequal.h index 978e6a9052c16..dcc26c37b73c1 100644 --- a/libclc/opencl/include/clc/opencl/relational/islessequal.h +++ b/libclc/opencl/include/clc/opencl/relational/islessequal.h @@ -14,7 +14,7 @@ #define FUNCTION islessequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/islessgreater.h b/libclc/opencl/include/clc/opencl/relational/islessgreater.h index 56cce7db20770..15a1eb5577531 100644 --- a/libclc/opencl/include/clc/opencl/relational/islessgreater.h +++ b/libclc/opencl/include/clc/opencl/relational/islessgreater.h @@ -14,7 +14,7 @@ #define FUNCTION islessgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isnormal.h b/libclc/opencl/include/clc/opencl/relational/isnormal.h index ee74a990b5eaf..bbb06aad0df2a 100644 --- a/libclc/opencl/include/clc/opencl/relational/isnormal.h +++ b/libclc/opencl/include/clc/opencl/relational/isnormal.h @@ -14,7 +14,7 @@ #define FUNCTION isnormal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isnotequal.h b/libclc/opencl/include/clc/opencl/relational/isnotequal.h index 7cf94e3ceec5f..c13aca8ef4be8 100644 --- a/libclc/opencl/include/clc/opencl/relational/isnotequal.h +++ b/libclc/opencl/include/clc/opencl/relational/isnotequal.h @@ -14,7 +14,7 @@ #define FUNCTION
isnotequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isordered.h b/libclc/opencl/include/clc/opencl/relational/isordered.h index ad9770bd627f2..ea4ba3fa6fe8d 100644 --- a/libclc/opencl/include/clc/opencl/relational/isordered.h +++ b/libclc/opencl/include/clc/opencl/relational/isordered.h @@ -14,7 +14,7 @@ #define FUNCTION isordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isunordered.h b/libclc/opencl/include/clc/opencl/relational/isunordered.h index 01d2f53837317..76bf85604d1c7 100644 --- a/libclc/opencl/include/clc/opencl/relational/isunordered.h +++ b/libclc/opencl/include/clc/opencl/relational/isunordered.h @@ -14,7 +14,7 @@ #define FUNCTION isunordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/signbit.h b/libclc/opencl/include/clc/opencl/relational/signbit.h index 29591c0c126a9..6ad6595c7e294 100644 --- a/libclc/opencl/include/clc/opencl/relational/signbit.h +++ b/libclc/opencl/include/clc/opencl/relational/signbit.h @@ -14,7 +14,7 @@ #define FUNCTION signbit #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/lib/generic/SOURCES b/libclc/opencl/lib/generic/SOURCES index 46ce6d6e36c24..61757efbcaad7 100644 --- a/libclc/opencl/lib/generic/SOURCES +++ b/libclc/opencl/lib/generic/SOURCES @@ -8,24 +8,36 @@ atomic/atom_add.cl atomic/atom_and.cl atomic/atom_cmpxchg.cl atomic/atom_dec.cl -atomic/atom_inc.cl -atomic/atom_max.cl -atomic/atom_min.cl -atomic/atom_or.cl -atomic/atom_sub.cl -atomic/atom_xchg.cl -atomic/atom_xor.cl atomic/atomic_add.cl atomic/atomic_and.cl atomic/atomic_cmpxchg.cl +atomic/atomic_compare_exchange_strong.cl +atomic/atomic_compare_exchange_weak.cl atomic/atomic_dec.cl +atomic/atomic_exchange.cl +atomic/atomic_fetch_add.cl +atomic/atomic_fetch_and.cl +atomic/atomic_fetch_max.cl +atomic/atomic_fetch_min.cl +atomic/atomic_fetch_or.cl +atomic/atomic_fetch_sub.cl +atomic/atomic_fetch_xor.cl atomic/atomic_inc.cl +atomic/atomic_load.cl atomic/atomic_max.cl atomic/atomic_min.cl atomic/atomic_or.cl +atomic/atomic_store.cl atomic/atomic_sub.cl atomic/atomic_xchg.cl atomic/atomic_xor.cl +atomic/atom_inc.cl +atomic/atom_max.cl +atomic/atom_min.cl +atomic/atom_or.cl +atomic/atom_sub.cl +atomic/atom_xchg.cl +atomic/atom_xor.cl common/degrees.cl common/mix.cl common/radians.cl @@ -43,6 +55,10 @@ geometric/normalize.cl integer/abs.cl integer/abs_diff.cl integer/add_sat.cl +integer/bitfield_extract_signed.cl +integer/bitfield_extract_unsigned.cl +integer/bitfield_insert.cl +integer/bit_reverse.cl integer/clz.cl integer/ctz.cl integer/hadd.cl diff --git a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl new file mode 100644 index 0000000000000..422c03f292071 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
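+//
+// Per OpenCL C 3.0 semantics, the strong compare-exchange returns true iff
+// *Ptr matched *Expected and was replaced by Desired; on failure the value
+// observed in *Ptr is written back to *Expected. A typical caller-side use
+// (sketch, assuming an atomic_int counter):
+//
+//   int expected = atomic_load(&counter);
+//   while (!atomic_compare_exchange_strong(&counter, &expected,
+//                                          expected + 1)) {
+//     // expected was refreshed with the observed value; retry
+//   }
+//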
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_compare_exchange_strong +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl new file mode 100644 index 0000000000000..8a6b3c4f0110e --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_compare_exchange_weak +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_dec.cl b/libclc/opencl/lib/generic/atomic/atomic_dec.cl index 6f18cdf13428a..6de55bc0b9845 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_dec.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_dec.cl @@ -6,15 +6,11 @@ // //===----------------------------------------------------------------------===// +#include #include -#define IMPL(TYPE, AS) \ - _CLC_OVERLOAD _CLC_DEF TYPE atomic_dec(volatile AS TYPE *p) { \ - return __sync_fetch_and_sub(p, (TYPE)1); \ - } +#define FUNCTION atomic_dec +#define __IMPL_FUNCTION __clc_atomic_dec -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +#define __CLC_BODY +#include diff --git a/libclc/opencl/lib/generic/atomic/atomic_def.inc b/libclc/opencl/lib/generic/atomic/atomic_def.inc new file mode 100644 index 0000000000000..ce192bf844938 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_def.inc @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
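+//
+// This body is instantiated once per scalar gentype via __CLC_BODY. The
+// HAVE_* gates restrict 64-bit integer and fp64 atomics to devices with the
+// cl_khr_int64_* extensions, and __CLC_ATOMIC_GENTYPE maps each plain type
+// onto its atomic_* counterpart. For the default (fetch-style) branch, the
+// int/global instantiation of atomic_fetch_add comes out roughly as:
+//
+//   _CLC_OVERLOAD _CLC_DEF int
+//   atomic_fetch_add(volatile global atomic_int *Ptr, int Value) {
+//     return __clc_atomic_fetch_add((volatile global int *)Ptr, Value,
+//                                   __ATOMIC_SEQ_CST,
+//                                   __MEMORY_SCOPE_DEVICE);
+//   }
+//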
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) && \ + defined(cl_khr_int64_extended_atomics)) +#define HAVE_64_ATOMIC +#endif +#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(HAVE_64_ATOMIC)) +#define HAVE_FP_ATOMIC +#endif +#if defined(__CLC_GENSIZE) && \ + ((__CLC_GENSIZE == 32) || \ + (__CLC_GENSIZE == 64 && defined(HAVE_64_ATOMIC))) +#define HAVE_INT_ATOMIC +#endif +#if defined(HAVE_FP_ATOMIC) || defined(HAVE_INT_ATOMIC) + +#define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE) + +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr) { \ + return __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF void FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) { \ + __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } +#elif defined(__CLC_COMPARE_EXCHANGE) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF bool FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, \ + ADDRSPACE __CLC_GENTYPE *Expected, __CLC_GENTYPE Desired) { \ + __CLC_GENTYPE Comparator = *Expected; \ + __CLC_GENTYPE RetValue = __clc_atomic_compare_exchange( \ + (volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Comparator, Desired, \ + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); \ + if (Comparator != RetValue) { \ + *Expected = RetValue; \ + return false; \ + } \ + return true; \ + } +#else +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) { \ + return __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } +#endif + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // HAVE_FP_ATOMIC || HAVE_INT_ATOMIC + +#undef HAVE_INT_ATOMIC +#undef HAVE_FP_ATOMIC +#undef HAVE_64_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/opencl/lib/generic/atomic/atomic_exchange.cl b/libclc/opencl/lib/generic/atomic/atomic_exchange.cl new file mode 100644 index 0000000000000..6dae6c0a77599 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_exchange.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_exchange +#define __IMPL_FUNCTION __clc_atomic_exchange + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl new file mode 100644 index 0000000000000..bbaa1c2b0dacf --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_add +#define __IMPL_FUNCTION __clc_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl new file mode 100644 index 0000000000000..73925844c9357 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_and +#define __IMPL_FUNCTION __clc_atomic_fetch_and + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl new file mode 100644 index 0000000000000..8c8ce11cc575f --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_max +#define __IMPL_FUNCTION __clc_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl new file mode 100644 index 0000000000000..550459cee32d6 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_min +#define __IMPL_FUNCTION __clc_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl new file mode 100644 index 0000000000000..2606ff3c99673 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_or +#define __IMPL_FUNCTION __clc_atomic_fetch_or + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl new file mode 100644 index 0000000000000..33772233bebed --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_sub +#define __IMPL_FUNCTION __clc_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl new file mode 100644 index 0000000000000..6f6503e588b6f --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_xor +#define __IMPL_FUNCTION __clc_atomic_fetch_xor + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc.cl b/libclc/opencl/lib/generic/atomic/atomic_inc.cl index 13349e5432e5c..a160b2e2370fc 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_inc.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_inc.cl @@ -6,15 +6,11 @@ // //===----------------------------------------------------------------------===// +#include #include -#define IMPL(TYPE, AS) \ - _CLC_OVERLOAD _CLC_DEF TYPE atomic_inc(volatile AS TYPE *p) { \ - return __sync_fetch_and_add(p, (TYPE)1); \ - } +#define FUNCTION atomic_inc +#define __IMPL_FUNCTION __clc_atomic_inc -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +#define __CLC_BODY +#include diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc new file mode 100644 index 0000000000000..0bcf300dd284a --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
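+//
+// The legacy atomic_inc/atomic_dec builtins are only specified for 32-bit
+// int/uint in the global and local address spaces, which is why this body
+// is gated on __CLC_GENSIZE == 32 and omits the generic address space. The
+// uint/global instantiation of atomic_inc expands to roughly:
+//
+//   _CLC_OVERLOAD _CLC_DEF uint atomic_inc(volatile global uint *Ptr) {
+//     return __clc_atomic_inc(Ptr, __ATOMIC_SEQ_CST,
+//                             __MEMORY_SCOPE_DEVICE);
+//   }
+//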
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if __CLC_GENSIZE == 32 + +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr) { \ + return __IMPL_FUNCTION(Ptr, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) + +#undef __CLC_DEFINE_ATOMIC + +#endif // __CLC_GENSIZE == 32 + +#endif // __CLC_SCALAR diff --git a/libclc/opencl/lib/generic/atomic/atomic_load.cl b/libclc/opencl/lib/generic/atomic/atomic_load.cl new file mode 100644 index 0000000000000..459265473a8c8 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_load.cl @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_load +#define __IMPL_FUNCTION __clc_atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_store.cl b/libclc/opencl/lib/generic/atomic/atomic_store.cl new file mode 100644 index 0000000000000..67f2c8457fc10 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_store.cl @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_store +#define __IMPL_FUNCTION __clc_atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/integer/bit_reverse.cl b/libclc/opencl/lib/generic/integer/bit_reverse.cl new file mode 100644 index 0000000000000..23181b6b3eba5 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bit_reverse.cl @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
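+//
+// bit_reverse mirrors the order of all bits in its operand, per
+// cl_khr_extended_bit_ops. Assuming the elided __CLC_BODY include follows
+// the same wrapper shape as the bitfield files (with the usual __clc_
+// implementation prefix), the uint instantiation would look roughly like:
+//
+//   _CLC_OVERLOAD _CLC_DEF uint bit_reverse(uint x) {
+//     return __clc_bit_reverse(x); // e.g. 0x00000001u -> 0x80000000u
+//   }
+//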
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bit_reverse +#define __CLC_BODY + +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc b/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc new file mode 100644 index 0000000000000..0262f67732afc --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __IMPL_FUNCTION +#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE base, uint offset, + uint count) { + return __IMPL_FUNCTION(FUNCTION)(base, offset, count); +} diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl b/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl new file mode 100644 index 0000000000000..eaa4ac779cfd1 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_extract_signed +#define __RETTYPE __CLC_S_GENTYPE + +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl b/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl new file mode 100644 index 0000000000000..fd63d5d6dee30 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_extract_unsigned +#define __RETTYPE __CLC_U_GENTYPE + +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_insert.cl b/libclc/opencl/lib/generic/integer/bitfield_insert.cl new file mode 100644 index 0000000000000..6b441155f393b --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_insert.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
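+//
+// bitfield_insert(base, insert, offset, count) replaces the count-bit field
+// of base starting at bit offset with the low count bits of insert
+// (cl_khr_extended_bit_ops). The wrapper in bitfield_insert.inc below
+// forwards directly to __clc_bitfield_insert, so for example:
+//
+//   uint r = bitfield_insert(0xFFFFFFFFu, 0x0u, 8, 4); // r == 0xFFFFF0FFu
+//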
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_insert +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_insert.inc b/libclc/opencl/lib/generic/integer/bitfield_insert.inc new file mode 100644 index 0000000000000..b1f45907a4361 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_insert.inc @@ -0,0 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE bitfield_insert(__CLC_GENTYPE base, + __CLC_GENTYPE insert, + uint offset, uint count) { + return __clc_bitfield_insert(base, insert, offset, count); +} diff --git a/libclc/opencl/lib/generic/relational/binary_def.inc b/libclc/opencl/lib/generic/relational/binary_def.inc index 54bb237b8f8f5..8416da0475a2c 100644 --- a/libclc/opencl/lib/generic/relational/binary_def.inc +++ b/libclc/opencl/lib/generic/relational/binary_def.inc @@ -10,6 +10,14 @@ #define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) -_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_FLOATN a, __CLC_FLOATN b) { +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b) { return __IMPL_FUNCTION(FUNCTION)(a, b); } + +#undef __RETTYPE diff --git a/libclc/opencl/lib/generic/relational/isequal.cl b/libclc/opencl/lib/generic/relational/isequal.cl index 94f83f9452666..83002c28ceab3 100644 --- a/libclc/opencl/lib/generic/relational/isequal.cl +++ b/libclc/opencl/lib/generic/relational/isequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isfinite.cl b/libclc/opencl/lib/generic/relational/isfinite.cl index 695ffea806d5c..a2017133cead8 100644 --- a/libclc/opencl/lib/generic/relational/isfinite.cl +++ b/libclc/opencl/lib/generic/relational/isfinite.cl @@ -12,4 +12,4 @@ #define FUNCTION isfinite #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isgreater.cl b/libclc/opencl/lib/generic/relational/isgreater.cl index fb46ff20ac608..6eeb2b21c0493 100644 --- a/libclc/opencl/lib/generic/relational/isgreater.cl +++ b/libclc/opencl/lib/generic/relational/isgreater.cl @@ -12,4 +12,4 @@ #define FUNCTION isgreater #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isgreaterequal.cl b/libclc/opencl/lib/generic/relational/isgreaterequal.cl index b8edde2a05b77..e4e4535fd30d3 100644 --- a/libclc/opencl/lib/generic/relational/isgreaterequal.cl +++ b/libclc/opencl/lib/generic/relational/isgreaterequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isgreaterequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isinf.cl b/libclc/opencl/lib/generic/relational/isinf.cl index 2c15f1f826762..2ab8c182e02a6 100644 --- a/libclc/opencl/lib/generic/relational/isinf.cl +++ 
b/libclc/opencl/lib/generic/relational/isinf.cl @@ -12,4 +12,4 @@ #define FUNCTION isinf #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isless.cl b/libclc/opencl/lib/generic/relational/isless.cl index 0af1f53e71042..4212970e7671a 100644 --- a/libclc/opencl/lib/generic/relational/isless.cl +++ b/libclc/opencl/lib/generic/relational/isless.cl @@ -12,4 +12,4 @@ #define FUNCTION isless #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/islessequal.cl b/libclc/opencl/lib/generic/relational/islessequal.cl index 9e32afc718ab2..e7aec262fc762 100644 --- a/libclc/opencl/lib/generic/relational/islessequal.cl +++ b/libclc/opencl/lib/generic/relational/islessequal.cl @@ -12,4 +12,4 @@ #define FUNCTION islessequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/islessgreater.cl b/libclc/opencl/lib/generic/relational/islessgreater.cl index c36a857dc3dfc..b775d2484550c 100644 --- a/libclc/opencl/lib/generic/relational/islessgreater.cl +++ b/libclc/opencl/lib/generic/relational/islessgreater.cl @@ -12,4 +12,4 @@ #define FUNCTION islessgreater #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnan.cl b/libclc/opencl/lib/generic/relational/isnan.cl index 8b03930c5312f..4b7eeb5b919b6 100644 --- a/libclc/opencl/lib/generic/relational/isnan.cl +++ b/libclc/opencl/lib/generic/relational/isnan.cl @@ -12,4 +12,4 @@ #define FUNCTION isnan #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnormal.cl b/libclc/opencl/lib/generic/relational/isnormal.cl index 4ba21cc3e17fc..60ce9dccaeaf3 100644 --- a/libclc/opencl/lib/generic/relational/isnormal.cl +++ b/libclc/opencl/lib/generic/relational/isnormal.cl @@ -12,4 +12,4 @@ #define FUNCTION isnormal #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnotequal.cl b/libclc/opencl/lib/generic/relational/isnotequal.cl index 928923b9b2a5e..abb4d3a859663 100644 --- a/libclc/opencl/lib/generic/relational/isnotequal.cl +++ b/libclc/opencl/lib/generic/relational/isnotequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isnotequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isordered.cl b/libclc/opencl/lib/generic/relational/isordered.cl index 60ca4d67ff1ea..684ee425e1203 100644 --- a/libclc/opencl/lib/generic/relational/isordered.cl +++ b/libclc/opencl/lib/generic/relational/isordered.cl @@ -12,4 +12,4 @@ #define FUNCTION isordered #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isunordered.cl b/libclc/opencl/lib/generic/relational/isunordered.cl index 3392d77856ced..84aa8cafb111a 100644 --- a/libclc/opencl/lib/generic/relational/isunordered.cl +++ b/libclc/opencl/lib/generic/relational/isunordered.cl @@ -12,4 +12,4 @@ #define FUNCTION isunordered #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/signbit.cl b/libclc/opencl/lib/generic/relational/signbit.cl index 26feb8d43fa25..d30fea7b9f6f5 100644 --- a/libclc/opencl/lib/generic/relational/signbit.cl +++ b/libclc/opencl/lib/generic/relational/signbit.cl @@ -12,4 +12,4 @@ #define FUNCTION signbit #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/unary_def.inc 
b/libclc/opencl/lib/generic/relational/unary_def.inc index 47bb33ef2da3d..f184e3cf0be56 100644 --- a/libclc/opencl/lib/generic/relational/unary_def.inc +++ b/libclc/opencl/lib/generic/relational/unary_def.inc @@ -10,6 +10,14 @@ #define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) -_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_FLOATN a) { +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE a) { return __IMPL_FUNCTION(FUNCTION)(a); } + +#undef __RETTYPE diff --git a/libclc/utils/CMakeLists.txt b/libclc/utils/CMakeLists.txt new file mode 100644 index 0000000000000..ea1d9e9c8ef5f --- /dev/null +++ b/libclc/utils/CMakeLists.txt @@ -0,0 +1,24 @@ +# Construct LLVM version define +set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}" ) + +# Setup prepare_builtins tools +set( LLVM_LINK_COMPONENTS + BitReader + BitWriter + Core + IRReader + Support +) + +if( LIBCLC_STANDALONE_BUILD ) + add_llvm_executable( prepare_builtins prepare-builtins.cpp ) + set( prepare_builtins_exe prepare_builtins ) + set( prepare_builtins_target prepare_builtins ) +else() + add_llvm_utility( prepare_builtins prepare-builtins.cpp ) + setup_host_tool( prepare_builtins PREPARE_BUILTINS prepare_builtins_exe prepare_builtins_target ) +endif() + +target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} ) +# These were not properly reported in early LLVM and we don't need them +target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions ) diff --git a/libcxx/docs/Status/Cxx17Issues.csv b/libcxx/docs/Status/Cxx17Issues.csv index 3ca3e22845cfd..15f4b28575b6a 100644 --- a/libcxx/docs/Status/Cxx17Issues.csv +++ b/libcxx/docs/Status/Cxx17Issues.csv @@ -158,14 +158,14 @@ "`LWG2683 `__","filesystem::copy() says ""no effects""","2016-06 (Oulu)","|Complete|","","" "`LWG2684 `__","priority_queue lacking comparator typedef","2016-06 (Oulu)","|Complete|","","" "`LWG2685 `__","shared_ptr deleters must not throw on move construction","2016-06 (Oulu)","|Complete|","","" -"`LWG2687 `__","LWG2687: {inclusive,exclusive}_scan misspecified","2016-06 (Oulu)","|Complete|","","" +"`LWG2687 `__","{inclusive,exclusive}_scan misspecified","2016-06 (Oulu)","|Complete|","","" "`LWG2688 `__","clamp misses preconditions and has extraneous condition on result","2016-06 (Oulu)","|Complete|","","" "`LWG2689 `__","Parallel versions of std::copy and std::move shouldn't be in order","2016-06 (Oulu)","|Nothing To Do|","","" "`LWG2698 `__","Effect of assign() on iterators/pointers/references","2016-06 (Oulu)","|Complete|","","" "`LWG2704 `__","recursive_directory_iterator's members should require '``*this`` is dereferenceable'","2016-06 (Oulu)","|Complete|","","" "`LWG2706 `__","Error reporting for recursive_directory_iterator::pop() is under-specified","2016-06 (Oulu)","|Complete|","","" "`LWG2707 `__","path construction and assignment should have ""string_type&&"" overloads","2016-06 (Oulu)","|Complete|","","" -"`LWG2709 `__","LWG2709: offsetof is unnecessarily imprecise","2016-06 (Oulu)","|Nothing To Do|","","" +"`LWG2709 `__","offsetof is unnecessarily imprecise","2016-06 (Oulu)","|Nothing To Do|","","" "`LWG2710 `__","""Effects: Equivalent to ..."" doesn't count ""Synchronization:"" as determined semantics","2016-06 (Oulu)","|Complete|","","" "`LWG2711 `__","path is convertible from approximately everything under the sun","2016-06 (Oulu)","|Complete|","","" "`LWG2716 
`__","Specification of shuffle and sample disallows lvalue URNGs","2016-06 (Oulu)","|Complete|","","" diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index cb3248a4155ba..98b49f92bdf7a 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -237,7 +237,7 @@ "`LWG3310 `__","Replace ``SIZE_MAX``\ with ``numeric_limits::max()``\ ","2020-02 (Prague)","|Complete|","16","" "`LWG3313 `__","``join_view::iterator::operator--``\ is incorrectly constrained","2020-02 (Prague)","|Complete|","14","" "`LWG3314 `__","Is stream insertion behavior locale dependent when ``Period::type``\ is ``micro``\ ?","2020-02 (Prague)","|Complete|","16","" -"`LWG3315 `__","LWG3315: Correct Allocator Default Behavior","2020-02 (Prague)","|Complete|","","" +"`LWG3315 `__","Correct Allocator Default Behavior","2020-02 (Prague)","|Complete|","","" "`LWG3316 `__","Correctly define epoch for ``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","","","" "`LWG3317 `__","Incorrect ``operator<<``\ for floating-point durations","2020-02 (Prague)","|Complete|","16","" "`LWG3318 `__","Clarify whether clocks can represent time before their epoch","2020-02 (Prague)","","","" @@ -294,7 +294,7 @@ "`LWG3389 `__","A move-only iterator still does not have a ``counted_iterator``\ ","2020-02 (Prague)","|Complete|","15","" "`LWG3390 `__","``make_move_iterator()``\ cannot be used to construct a ``move_iterator``\ for a move-only iterator","2020-02 (Prague)","|Complete|","14","" "`LWG3393 `__","Missing/incorrect feature test macro for coroutines","2020-02 (Prague)","|Complete|","14","" -"`LWG3395 `__","LWG3395: Definition for three-way comparison needs to be updated (US 152)","2020-02 (Prague)","|Nothing To Do|","","" +"`LWG3395 `__","Definition for three-way comparison needs to be updated (US 152)","2020-02 (Prague)","|Nothing To Do|","","" "`LWG3396 `__","Clarify point of reference for ``source_location::current()``\ (DE 169)","2020-02 (Prague)","|Nothing To Do|","16","" "`LWG3397 `__","``ranges::basic_istream_view::iterator``\ should not provide ``iterator_category``\ ","2020-02 (Prague)","|Complete|","16","" "`LWG3398 `__","``tuple_element_t``\ is also wrong for ``const subrange``\ ","2020-02 (Prague)","|Complete|","14","" diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index c8a3aa95b7480..6fcb2f3c78cfc 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -36,7 +36,7 @@ "`LWG3970 `__","[mdspan.syn] Missing definition of ``full_extent_t`` and ``full_extent``","2023-11 (Kona)","","","" "`LWG3973 `__","Monadic operations should be ADL-proof","2023-11 (Kona)","","","" "`LWG3974 `__","``mdspan::operator[]`` should not copy ``OtherIndexTypes``","2023-11 (Kona)","","","" -"`LWG3987 `__","LWG3987: Including `` doesn't provide `std::begin`/`end`","2023-11 (Kona)","|Complete|","","" +"`LWG3987 `__","Including `` doesn't provide `std::begin`/`end`","2023-11 (Kona)","|Complete|","","" "`LWG3990 `__","Program-defined specializations of ``std::tuple`` and ``std::variant`` can't be properly supported","2023-11 (Kona)","|Complete|","21","" "`LWG4001 `__","``iota_view`` should provide ``empty``","2023-11 (Kona)","|Complete|","19","" "","","","","","" @@ -45,7 +45,7 @@ "`LWG3950 `__","``std::basic_string_view`` comparison operators are overspecified","2024-03 (Tokyo)","|Complete|","18","" "`LWG3975 `__","Specializations of ``basic_format_context`` should not be permitted","2024-03 (Tokyo)","|Nothing To 
Do|","","" "`LWG3984 `__","``ranges::to``'s recursion branch may be ill-formed","2024-03 (Tokyo)","|Complete|","19","" -"`LWG4011 `__","``""Effects: Equivalent to return""`` in ``[span.elem]``","2024-03 (Tokyo)","|Nothing To Do|","","" +"`LWG4011 `__","""`Effects`: Equivalent to return"" in ``[span.elem]``","2024-03 (Tokyo)","|Nothing To Do|","","" "`LWG4012 `__","``common_view::begin/end`` are missing the ``simple-view`` check","2024-03 (Tokyo)","","","" "`LWG4013 `__","``lazy_split_view::outer-iterator::value_type`` should not provide default constructor","2024-03 (Tokyo)","","","" "`LWG4016 `__","container-insertable checks do not match what container-inserter does","2024-03 (Tokyo)","|Complete|","20","" @@ -92,7 +92,7 @@ "`LWG4085 `__","``ranges::generate_random``'s helper lambda should specify the return type","2024-11 (Wrocław)","","","" "`LWG4088 `__","``println`` ignores the locale imbued in ``std::ostream``","2024-11 (Wrocław)","|Complete|","18","" "`LWG4112 `__","``has-arrow`` should required ``operator->()`` to be ``const``-qualified","2024-11 (Wrocław)","","","" -"`LWG4113 `__","LWG4113: Disallow `has_unique_object_representations`","2024-11 (Wrocław)","|Complete|","","" +"`LWG4113 `__","Disallow ``has_unique_object_representations``","2024-11 (Wrocław)","|Complete|","","" "`LWG4119 `__","``generator::promise_type::yield_value(ranges::elements_of)``'s nested ``generator`` may be ill-formed","2024-11 (Wrocław)","","","" "`LWG4124 `__","Cannot format ``zoned_time`` with resolution coarser than ``seconds``","2024-11 (Wrocław)","","","" "`LWG4126 `__","Some feature-test macros for fully freestanding features are not yet marked freestanding","2024-11 (Wrocław)","","","" @@ -125,7 +125,7 @@ "`LWG4202 `__","``enable-sender`` should be a variable template","2025-06 (Sofia)","","","" "`LWG4203 `__","Constraints on ``get-state`` functions are incorrect","2025-06 (Sofia)","","","" "`LWG4204 `__","specification of ``as-sndr2(Sig)`` in [exec.let] is incomplete","2025-06 (Sofia)","","","" -"`LWG4205 `__","let_[*].transform_env is specified in terms of the let_* sender itself instead of its child","2025-06 (Sofia)","","","" +"`LWG4205 `__","``let_[*].transform_env`` is specified in terms of the ``let_*`` sender itself instead of its child","2025-06 (Sofia)","","","" "`LWG4208 `__","Wording needs to ensure that in ``connect(sndr, rcvr)`` that ``rcvr`` expression is only evaluated once","2025-06 (Sofia)","","","" "`LWG4209 `__","``default_domain::transform_env`` should be returning ``FWD-ENV(env)``","2025-06 (Sofia)","","","" "`LWG4188 `__","``ostream::sentry`` destructor should handle exceptions","2025-06 (Sofia)","","","" @@ -147,6 +147,6 @@ "`LWG4247 `__","Header ```` is not yet freestanding","2025-06 (Sofia)","","","" "","","","","","" "`LWG3343 `__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Adopted Yet","|Complete|","16","" -"`LWG4139 `__","§[time.zone.leap] recursive constraint in <=>","Not Adopted Yet","|Complete|","20","" -"`LWG3456 `__","Pattern used by std::from_chars is underspecified (option B)","Not Adopted Yet","|Complete|","20","" +"`LWG4139 `__","§[time.zone.leap] recursive constraint in ``<=>``","Not Adopted Yet","|Complete|","20","" +"`LWG3456 `__","Pattern used by ``std::from_chars`` is underspecified (option B)","Not Adopted Yet","|Complete|","20","" "","","","","","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 
a1854a6acc41a..febb0c176f9c4 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -59,7 +59,7 @@ "`P2248R8 `__","Enabling list-initialization for algorithms","2024-03 (Tokyo)","","","" "`P2810R4 `__","``is_debugger_present`` ``is_replaceable``","2024-03 (Tokyo)","","","" "`P1068R11 `__","Vector API for random number generation","2024-03 (Tokyo)","","","" -"`P2944R3 `__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Partial|","","The changes to ``optional`` and ``tuple``'s equality overload from P2165R4 are not yet implemented" +"`P2944R3 `__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Partial|","","The changes to ``tuple``'s equality overload from P2165R4 are not yet implemented." "`P2642R6 `__","Padded ``mdspan`` layouts","2024-03 (Tokyo)","","","" "`P3029R1 `__","Better ``mdspan``'s CTAD","2024-03 (Tokyo)","|Complete|","19","" "","","","","","" @@ -120,8 +120,9 @@ "`P2996R13 `__","Reflection for C++26","2025-06 (Sofia)","","","" "`P3394R4 `__","Annotations for Reflection","2025-06 (Sofia)","","","" "`P3293R3 `__","Splicing a base class subobject","2025-06 (Sofia)","","","" +"`P3491R3 `__","``define_static_{string,object,array}``","2025-06 (Sofia)","","","" "`P3096R12 `__","Function Parameter Reflection in Reflection for C++26","2025-06 (Sofia)","","","" -"`P2988R12 `__","``std::optional<‍T&‍>``","2025-06 (Sofia)","","","" +"`P2988R12 `__","``std::optional``","2025-06 (Sofia)","","","" "`P3348R4 `__","C++26 should refer to C23 not C17","2025-06 (Sofia)","","","" "`P3037R6 `__","``constexpr`` ``std::shared_ptr`` and friends","2025-06 (Sofia)","","","" "`P3284R4 `__","``write_env`` and ``unstoppable`` Sender Adaptors","2025-06 (Sofia)","","","" @@ -141,7 +142,7 @@ "`P3481R5 `__","``std::execution::bulk()`` issues","2025-06 (Sofia)","","","" "`P3433R1 `__","Allocator Support for Operation States","2025-06 (Sofia)","","","" "`P3149R11 `__","``async_scope`` - Creating scopes for non-sequential concurrency","2025-06 (Sofia)","","","" -"`P3682R0 `__"," Remove ``std::execution::split``","2025-06 (Sofia)","","","" +"`P3682R0 `__","Remove ``std::execution::split``","2025-06 (Sofia)","","","" "`P2079R10 `__","Parallel scheduler","2025-06 (Sofia)","","","" "`P3557R3 `__","High-Quality Sender Diagnostics with Constexpr Exceptions","2025-06 (Sofia)","","","" "`P3560R2 `__","Error Handling in Reflection","2025-06 (Sofia)","","","" diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index ae9cc87c797f8..a44c3161534b3 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -133,7 +133,7 @@ velocity, libc++ drops support for older compilers as newer ones are released. 
Compiler Versions Restrictions Support policy ============ =================== ========================== ===================== Clang 19, 20, 21-git latest two stable releases per `LLVM's release page `_ and the development version -AppleClang 15 latest stable release per `Xcode's release page `_ +AppleClang 16.4 latest stable release per `Xcode's release page `_ Open XL 17.1.3 (AIX) latest stable release per `Open XL's documentation page `_ GCC 15 In C++11 or later only latest stable release per `GCC's release page `_ ============ =================== ========================== ===================== diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 4f2a8dddad92c..25b567df2dd33 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -403,6 +403,7 @@ set(files __format/indic_conjunct_break_table.h __format/parser_std_format_spec.h __format/range_default_formatter.h + __format/range_format.h __format/range_formatter.h __format/unicode.h __format/width_estimation_table.h @@ -514,7 +515,6 @@ set(files __locale_dir/check_grouping.h __locale_dir/get_c_locale.h __locale_dir/locale_base_api.h - __locale_dir/locale_base_api/android.h __locale_dir/locale_base_api/bsd_locale_fallbacks.h __locale_dir/locale_base_api/ibm.h __locale_dir/locale_base_api/musl.h diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index 47942a09e67c5..07fef20f6166d 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -26,9 +26,7 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> // TODO: Find out how altivec changes things and allow vectorizations there too. -// TODO: Simplify this condition once we stop building with AppleClang 15 in the CI. -#if _LIBCPP_STD_VER >= 14 && defined(_LIBCPP_COMPILER_CLANG_BASED) && !defined(__ALTIVEC__) && \ - !(defined(_LIBCPP_APPLE_CLANG_VER) && _LIBCPP_APPLE_CLANG_VER < 1600) +#if _LIBCPP_STD_VER >= 14 && defined(_LIBCPP_COMPILER_CLANG_BASED) && !defined(__ALTIVEC__) # define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 1 #else # define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 0 diff --git a/libcxx/include/__config b/libcxx/include/__config index d940461c30234..ee06abfba7a08 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -28,7 +28,7 @@ // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM. // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is // defined to XXYYZZ. -# define _LIBCPP_VERSION 210000 +# define _LIBCPP_VERSION 220000 # define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) @@ -416,6 +416,12 @@ typedef __char32_t char32_t; # define _LIBCPP_GCC_DIAGNOSTIC_IGNORED(str) # endif +// Macros to enter and leave a state where deprecation warnings are suppressed. +# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH \ + _LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated") \ + _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations") +# define _LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_DIAGNOSTIC_POP + # if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_FAST # define _LIBCPP_HARDENING_SIG f # elif _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_EXTENSIVE @@ -713,17 +719,6 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_WITH_CHAR8_T # endif -// Macros to enter and leave a state where deprecation warnings are suppressed. 
-# if defined(_LIBCPP_COMPILER_CLANG_BASED) || defined(_LIBCPP_COMPILER_GCC) -# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH \ - _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wdeprecated\"") \ - _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -# define _LIBCPP_SUPPRESS_DEPRECATED_POP _Pragma("GCC diagnostic pop") -# else -# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH -# define _LIBCPP_SUPPRESS_DEPRECATED_POP -# endif - # if _LIBCPP_STD_VER <= 11 # define _LIBCPP_EXPLICIT_SINCE_CXX14 # else diff --git a/libcxx/include/__configuration/compiler.h b/libcxx/include/__configuration/compiler.h index 4b6c6cad353e3..11c07ed0dc474 100644 --- a/libcxx/include/__configuration/compiler.h +++ b/libcxx/include/__configuration/compiler.h @@ -37,7 +37,7 @@ # warning "Libc++ only supports Clang 19 and later" # endif # elif defined(_LIBCPP_APPLE_CLANG_VER) -# if _LIBCPP_APPLE_CLANG_VER < 1500 -# warning "Libc++ only supports AppleClang 15 and later" +# if _LIBCPP_APPLE_CLANG_VER < 1600 +# warning "Libc++ only supports AppleClang 16 and later" # endif # elif defined(_LIBCPP_GCC_VER) diff --git a/libcxx/include/__format/range_default_formatter.h b/libcxx/include/__format/range_default_formatter.h index 7149debb2f141..2769647ad527e 100644 --- a/libcxx/include/__format/range_default_formatter.h +++ b/libcxx/include/__format/range_default_formatter.h @@ -16,10 +16,10 @@ #include <__algorithm/ranges_copy.h> #include <__chrono/statically_widen.h> -#include <__concepts/same_as.h> #include <__config> #include <__format/concepts.h> #include <__format/formatter.h> +#include <__format/range_format.h> #include <__format/range_formatter.h> #include <__iterator/back_insert_iterator.h> #include <__ranges/concepts.h> @@ -42,51 +42,11 @@ concept __const_formattable_range = template using __fmt_maybe_const _LIBCPP_NODEBUG = conditional_t<__const_formattable_range<_Rp, _CharT>, const _Rp, _Rp>; -_LIBCPP_DIAGNOSTIC_PUSH -_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wshadow") -_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wshadow") -// This shadows map, set, and string. -enum class range_format { disabled, map, set, sequence, string, debug_string }; -_LIBCPP_DIAGNOSTIC_POP - // There is no definition of this struct, it's purely intended to be used to // generate diagnostics. template struct __instantiated_the_primary_template_of_format_kind; -template -constexpr range_format format_kind = [] { - // [format.range.fmtkind]/1 - // A program that instantiates the primary template of format_kind is ill-formed. - static_assert(sizeof(_Rp) != sizeof(_Rp), "create a template specialization of format_kind for your type"); - return range_format::disabled; -}(); - -template - requires same_as<_Rp, remove_cvref_t<_Rp>> -inline constexpr range_format format_kind<_Rp> = [] { - // [format.range.fmtkind]/2 - - // 2.1 If same_as>, R> is true, - // Otherwise format_kind is range_format::disabled. - if constexpr (same_as>, _Rp>) - return range_format::disabled; - // 2.2 Otherwise, if the qualified-id R::key_type is valid and denotes a type: - else if constexpr (requires { typename _Rp::key_type; }) { - // 2.2.1 If the qualified-id R::mapped_type is valid and denotes a type ... - if constexpr (requires { typename _Rp::mapped_type; } && - // 2.2.1 ... If either U is a specialization of pair or U is a specialization - // of tuple and tuple_size_v == 2 - __fmt_pair_like>>) - return range_format::map; - else - // 2.2.2 Otherwise format_kind is range_format::set. - return range_format::set; - } else - // 2.3 Otherwise, format_kind is range_format::sequence.
- return range_format::sequence; -}(); - template struct __range_default_formatter; diff --git a/libcxx/include/__format/range_format.h b/libcxx/include/__format/range_format.h new file mode 100644 index 0000000000000..139cfd92ee32b --- /dev/null +++ b/libcxx/include/__format/range_format.h @@ -0,0 +1,71 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___FORMAT_RANGE_FORMAT_H +#define _LIBCPP___FORMAT_RANGE_FORMAT_H + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#include <__concepts/same_as.h> +#include <__config> +#include <__format/concepts.h> +#include <__ranges/concepts.h> +#include <__type_traits/remove_cvref.h> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 23 + +_LIBCPP_DIAGNOSTIC_PUSH +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wshadow") +_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wshadow") +// This shadows map, set, and string. +enum class range_format { disabled, map, set, sequence, string, debug_string }; +_LIBCPP_DIAGNOSTIC_POP + +template +constexpr range_format format_kind = [] { + // [format.range.fmtkind]/1 + // A program that instantiates the primary template of format_kind is ill-formed. + static_assert(sizeof(_Rp) != sizeof(_Rp), "create a template specialization of format_kind for your type"); + return range_format::disabled; +}(); + +template + requires same_as<_Rp, remove_cvref_t<_Rp>> +inline constexpr range_format format_kind<_Rp> = [] { + // [format.range.fmtkind]/2 + + // 2.1 If same_as>, R> is true, + // Otherwise format_kind is range_format::disabled. + if constexpr (same_as>, _Rp>) + return range_format::disabled; + // 2.2 Otherwise, if the qualified-id R::key_type is valid and denotes a type: + else if constexpr (requires { typename _Rp::key_type; }) { + // 2.2.1 If the qualified-id R::mapped_type is valid and denotes a type ... + if constexpr (requires { typename _Rp::mapped_type; } && + // 2.2.1 ... If either U is a specialization of pair or U is a specialization + // of tuple and tuple_size_v == 2 + __fmt_pair_like>>) + return range_format::map; + else + // 2.2.2 Otherwise format_kind is range_format::set. + return range_format::set; + } else + // 2.3 Otherwise, format_kind is range_format::sequence. 
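Concretely, the dispatch that the new range_format.h owns resolves like this for everyday containers; a compile-time sketch, assuming C++23 and the standard <format> header:

    #include <format>
    #include <map>
    #include <set>
    #include <vector>

    // [format.range.fmtkind]/2 in action:
    static_assert(std::format_kind<std::map<int, int>> == std::range_format::map);    // key_type plus pair-like reference
    static_assert(std::format_kind<std::set<int>> == std::range_format::set);         // key_type only
    static_assert(std::format_kind<std::vector<int>> == std::range_format::sequence); // plain range

    int main() {}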
+ return range_format::sequence; +}(); + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_END_NAMESPACE_STD + +#endif diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 78f2f3bfd2f4c..03f50d9f3f269 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -122,6 +122,19 @@ struct __get_hash_node_value_type<__hash_value_type<_Key, _Tp> > { template using __get_hash_node_value_type_t _LIBCPP_NODEBUG = typename __get_hash_node_value_type<_Tp>::type; +template +struct __get_hash_node_key_type { + using type _LIBCPP_NODEBUG = _Tp; +}; + +template +struct __get_hash_node_key_type<__hash_value_type<_Key, _Tp> > { + using type _LIBCPP_NODEBUG = _Key; +}; + +template +using __get_hash_node_key_type_t _LIBCPP_NODEBUG = typename __get_hash_node_key_type<_Tp>::type; + template struct __hash_node : public __hash_node_base< __rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > > { using __node_value_type _LIBCPP_NODEBUG = __get_hash_node_value_type_t<_Tp>; @@ -182,69 +195,11 @@ class __hash_map_iterator; template class __hash_map_const_iterator; -template -struct __hash_key_value_types { - static_assert(!is_reference<_Tp>::value && !is_const<_Tp>::value, ""); - typedef _Tp key_type; - typedef _Tp __node_value_type; - typedef _Tp __container_value_type; - static const bool __is_map = false; - - _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(_Tp const& __v) { return __v; } - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(__node_value_type const& __v) { return __v; } - _LIBCPP_HIDE_FROM_ABI static __container_value_type* __get_ptr(__node_value_type& __n) { return std::addressof(__n); } - _LIBCPP_HIDE_FROM_ABI static __container_value_type&& __move(__node_value_type& __v) { return std::move(__v); } -}; - -template -struct __hash_key_value_types<__hash_value_type<_Key, _Tp> > { - typedef _Key key_type; - typedef _Tp mapped_type; - typedef __hash_value_type<_Key, _Tp> __node_value_type; - typedef pair __container_value_type; - typedef __container_value_type __map_value_type; - static const bool __is_map = true; - - _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(__container_value_type const& __v) { return __v.first; } - - template , __node_value_type>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(_Up& __t) { - return __t.__get_value(); - } - - template , __container_value_type>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(_Up& __t) { - return __t; - } - - _LIBCPP_HIDE_FROM_ABI static __container_value_type* __get_ptr(__container_value_type& __n) { - return std::addressof(__n); - } - _LIBCPP_HIDE_FROM_ABI static pair __move(__node_value_type& __v) { return __v.__move(); } -}; - -template , bool = _KVTypes::__is_map> -struct __hash_map_pointer_types {}; - -template -struct __hash_map_pointer_types<_Tp, _AllocPtr, _KVTypes, true> { - typedef typename _KVTypes::__map_value_type _Mv; - typedef __rebind_pointer_t<_AllocPtr, _Mv> __map_value_type_pointer; - typedef __rebind_pointer_t<_AllocPtr, const _Mv> __const_map_value_type_pointer; -}; - template ::element_type> struct __hash_node_types; template -struct __hash_node_types<_NodePtr, __hash_node<_Tp, _VoidPtr> > - : public __hash_key_value_types<_Tp>, - __hash_map_pointer_types<_Tp, _VoidPtr> - -{ - typedef __hash_key_value_types<_Tp> __base; - -public: +struct __hash_node_types<_NodePtr, __hash_node<_Tp, _VoidPtr> > { typedef ptrdiff_t difference_type; typedef size_t size_type; @@ -617,8 
+572,6 @@ public: typedef typename __alloc_traits::pointer pointer; private: - typedef __hash_node_types _NodeTypes; - allocator_type& __na_; public: @@ -633,7 +586,7 @@ public: _LIBCPP_HIDE_FROM_ABI void operator()(pointer __p) _NOEXCEPT { if (__value_constructed) { - __alloc_traits::destroy(__na_, _NodeTypes::__get_ptr(__p->__get_value())); + __alloc_traits::destroy(__na_, std::addressof(__p->__get_value())); std::__destroy_at(std::addressof(*__p)); } if (__p) @@ -684,6 +637,8 @@ template class __hash_table { public: using value_type = __get_hash_node_value_type_t<_Tp>; + using key_type = __get_hash_node_key_type_t<_Tp>; + typedef _Hash hasher; typedef _Equal key_equal; typedef _Alloc allocator_type; @@ -694,8 +649,6 @@ private: public: typedef typename _NodeTypes::__node_value_type __node_value_type; - typedef typename _NodeTypes::__container_value_type __container_value_type; - typedef typename _NodeTypes::key_type key_type; typedef value_type& reference; typedef const value_type& const_reference; typedef typename __alloc_traits::pointer pointer; @@ -824,7 +777,7 @@ public: template ::value, int> = 0> + __enable_if_t<__can_extract_map_key<_First, key_type, value_type>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI pair __emplace_unique(_First&& __f, _Second&& __s) { return __emplace_unique_key_args(__f, std::forward<_First>(__f), std::forward<_Second>(__s)); } @@ -854,9 +807,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __insert_unique_from_orphaned_node(value_type&& __value) { - using __key_type = typename _NodeTypes::key_type; - - __node_holder __h = __construct_node(const_cast<__key_type&&>(__value.first), std::move(__value.second)); + __node_holder __h = __construct_node(const_cast(__value.first), std::move(__value.second)); __node_insert_unique(__h.get()); __h.release(); } @@ -870,9 +821,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(value_type&& __value) { - using __key_type = typename _NodeTypes::key_type; - - __node_holder __h = __construct_node(const_cast<__key_type&&>(__value.first), std::move(__value.second)); + __node_holder __h = __construct_node(const_cast(__value.first), std::move(__value.second)); __node_insert_multi(__h.get()); __h.release(); } @@ -1047,12 +996,10 @@ private: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __assign_value(__get_hash_node_value_type_t<_Tp>& __lhs, _From&& __rhs) { - using __key_type = typename _NodeTypes::key_type; - // This is technically UB, since the object was constructed as `const`. // Clang doesn't optimize on this currently though. 
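For context on the const_cast in the __assign_value lines that follow: map nodes store pair<const Key, T>, so reusing a node's storage in place has to cast away the key's constness. A standalone sketch of that pattern (not libc++ code) shows why plain assignment is rejected and why the cast is formally UB, as the comment above notes:

    #include <utility>

    using node_value = std::pair<const int, int>; // what a map node actually stores

    void reuse_slot(node_value& slot, node_value&& from) {
      // slot.first = from.first;                 // ill-formed: .first is const
      const_cast<int&>(slot.first) = from.first;  // compiles, but formally UB: the subobject was created const
      slot.second = std::move(from.second);
    }

    int main() {
      node_value v{1, 10};
      reuse_slot(v, {2, 20});
    }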
- const_cast<__key_type&>(__lhs.first) = const_cast<__copy_cvref_t<_From, __key_type>&&>(__rhs.first); - __lhs.second = std::forward<_From>(__rhs).second; + const_cast(__lhs.first) = const_cast<__copy_cvref_t<_From, key_type>&&>(__rhs.first); + __lhs.second = std::forward<_From>(__rhs).second; } template ::value, int> = 0> @@ -1201,7 +1148,7 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer while (__np != nullptr) { __next_pointer __next = __np->__next_; __node_pointer __real_np = __np->__upcast(); - __node_traits::destroy(__na, _NodeTypes::__get_ptr(__real_np->__get_value())); + __node_traits::destroy(__na, std::addressof(__real_np->__get_value())); std::__destroy_at(std::addressof(*__real_np)); __node_traits::deallocate(__na, __real_np, 1); __np = __next; @@ -1290,8 +1237,8 @@ template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_unique(_InputIterator __first, _InputIterator __last) { typedef iterator_traits<_InputIterator> _ITraits; typedef typename _ITraits::value_type _ItValueType; - static_assert(is_same<_ItValueType, __container_value_type>::value, - "__assign_unique may only be called with the containers value type"); + static_assert( + is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type"); if (bucket_count() != 0) { __next_pointer __cache = __detach(); @@ -1321,10 +1268,8 @@ template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __first, _InputIterator __last) { typedef iterator_traits<_InputIterator> _ITraits; typedef typename _ITraits::value_type _ItValueType; - static_assert( - (is_same<_ItValueType, __container_value_type>::value || is_same<_ItValueType, __node_value_type>::value), - "__assign_multi may only be called with the containers value type" - " or the nodes value type"); + static_assert(is_same<_ItValueType, value_type>::value, + "__assign_multi may only be called with the containers value type or the nodes value type"); if (bucket_count() != 0) { __next_pointer __cache = __detach(); #if _LIBCPP_HAS_EXCEPTIONS @@ -1345,7 +1290,7 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __f __deallocate_node(__cache); } for (; __first != __last; ++__first) - __emplace_multi(_NodeTypes::__get_value(*__first)); + __emplace_multi(*__first); } template @@ -1863,7 +1808,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node(_Args&&... __args) { std::__construct_at(std::addressof(*__h), /* next = */ nullptr, /* hash = */ 0); // Now construct the value_type using the allocator's construct() method. 
- __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__get_value()), std::forward<_Args>(__args)...); + __node_traits::construct(__na, std::addressof(__h->__get_value()), std::forward<_Args>(__args)...); __h.get_deleter().__value_constructed = true; __h->__hash_ = hash_function()(__h->__get_value()); @@ -1879,7 +1824,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node_hash(size_t __hash, _ __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); std::__construct_at(std::addressof(*__h), /* next = */ nullptr, /* hash = */ __hash); __node_traits::construct( - __na, _NodeTypes::__get_ptr(__h->__get_value()), std::forward<_First>(__f), std::forward<_Rest>(__rest)...); + __na, std::addressof(__h->__get_value()), std::forward<_First>(__f), std::forward<_Rest>(__rest)...); __h.get_deleter().__value_constructed = true; return __h; } diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index 8dbc28e839839..9f3ce02a3af20 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -129,8 +129,6 @@ // will define those directly. # if defined(_AIX) || defined(__MVS__) # include <__locale_dir/locale_base_api/ibm.h> -# elif defined(__ANDROID__) -# include <__locale_dir/locale_base_api/android.h> # elif defined(__OpenBSD__) # include <__locale_dir/locale_base_api/openbsd.h> # elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC diff --git a/libcxx/include/__locale_dir/locale_base_api/android.h b/libcxx/include/__locale_dir/locale_base_api/android.h deleted file mode 100644 index 36b8d93e1b228..0000000000000 --- a/libcxx/include/__locale_dir/locale_base_api/android.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- C++ -*- -//===-----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H -#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H - -#include - -// FIXME: Is this actually required? -extern "C" { -#include -} - -#include - -// If we do not have this header, we are in a platform build rather than an NDK -// build, which will always be at least as new as the ToT NDK, in which case we -// don't need any of the inlines below since libc provides them. 
-#if __has_include(<android/ndk-version.h>)
-#  include <android/ndk-version.h>
-// In NDK versions later than 16, locale-aware functions are provided by
-// legacy_stdlib_inlines.h
-#  if __NDK_MAJOR__ <= 16
-#    if __ANDROID_API__ < 26
-
-inline _LIBCPP_HIDE_FROM_ABI float strtof_l(const char* __nptr, char** __endptr, locale_t) {
-  return ::strtof(__nptr, __endptr);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI double strtod_l(const char* __nptr, char** __endptr, locale_t) {
-  return ::strtod(__nptr, __endptr);
-}
-
-#    endif // __ANDROID_API__ < 26
-
-#  endif // __NDK_MAJOR__ <= 16
-#endif // __has_include(<android/ndk-version.h>)
-
-#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H
diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h
index b95c6a37c5c11..1b8711f10811d 100644
--- a/libcxx/include/__memory_resource/polymorphic_allocator.h
+++ b/libcxx/include/__memory_resource/polymorphic_allocator.h
@@ -50,7 +50,9 @@ class _LIBCPP_AVAILABILITY_PMR polymorphic_allocator {

   _LIBCPP_HIDE_FROM_ABI polymorphic_allocator() noexcept : __res_(std::pmr::get_default_resource()) {}

-  _LIBCPP_HIDE_FROM_ABI polymorphic_allocator(memory_resource* __r) noexcept : __res_(__r) {}
+  _LIBCPP_HIDE_FROM_ABI polymorphic_allocator(memory_resource* _LIBCPP_DIAGNOSE_NULLPTR __r) noexcept : __res_(__r) {
+    _LIBCPP_ASSERT_NON_NULL(__r, "Attempted to pass a nullptr resource to polymorphic_allocator");
+  }

   _LIBCPP_HIDE_FROM_ABI polymorphic_allocator(const polymorphic_allocator&) = default;

@@ -174,7 +176,7 @@ class _LIBCPP_AVAILABILITY_PMR polymorphic_allocator {
     return polymorphic_allocator();
   }

-  _LIBCPP_HIDE_FROM_ABI memory_resource* resource() const noexcept { return __res_; }
+  [[__gnu__::__returns_nonnull__]] _LIBCPP_HIDE_FROM_ABI memory_resource* resource() const noexcept { return __res_; }

   _LIBCPP_HIDE_FROM_ABI friend bool
   operator==(const polymorphic_allocator& __lhs, const polymorphic_allocator& __rhs) noexcept {
diff --git a/libcxx/include/__new/allocate.h b/libcxx/include/__new/allocate.h
index 738fa62af4d61..9bfe19aedb79f 100644
--- a/libcxx/include/__new/allocate.h
+++ b/libcxx/include/__new/allocate.h
@@ -31,37 +31,16 @@ _LIBCPP_CONSTEXPR inline _LIBCPP_HIDE_FROM_ABI bool __is_overaligned_for_new(size_t __align) _NOEXCEPT {
 #endif
 }

-template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI void* __libcpp_operator_new(_Args... __args) {
-#if __has_builtin(__builtin_operator_new) && __has_builtin(__builtin_operator_delete)
-  return __builtin_operator_new(__args...);
-#else
-  return ::operator new(__args...);
-#endif
-}
-
-template <class... _Args>
-_LIBCPP_HIDE_FROM_ABI void __libcpp_operator_delete(_Args...
__args) _NOEXCEPT { -#if __has_builtin(__builtin_operator_new) && __has_builtin(__builtin_operator_delete) - __builtin_operator_delete(__args...); -#else - ::operator delete(__args...); -#endif -} - template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _Tp* -__libcpp_allocate(__element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) { +__libcpp_allocate(__element_count __n, [[__maybe_unused__]] size_t __align = _LIBCPP_ALIGNOF(_Tp)) { size_t __size = static_cast(__n) * sizeof(_Tp); #if _LIBCPP_HAS_ALIGNED_ALLOCATION - if (__is_overaligned_for_new(__align)) { - const align_val_t __align_val = static_cast(__align); - return static_cast<_Tp*>(std::__libcpp_operator_new(__size, __align_val)); - } + if (__is_overaligned_for_new(__align)) + return static_cast<_Tp*>(__builtin_operator_new(__size, static_cast(__align))); #endif - (void)__align; - return static_cast<_Tp*>(std::__libcpp_operator_new(__size)); + return static_cast<_Tp*>(__builtin_operator_new(__size)); } #if _LIBCPP_HAS_SIZED_DEALLOCATION @@ -71,39 +50,29 @@ __libcpp_allocate(__element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) { #endif template -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate( - __type_identity_t<_Tp>* __ptr, __element_count __n, size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { - size_t __size = static_cast(__n) * sizeof(_Tp); - (void)__size; -#if !_LIBCPP_HAS_ALIGNED_ALLOCATION - (void)__align; - return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); -#else - if (__is_overaligned_for_new(__align)) { - const align_val_t __align_val = static_cast(__align); - return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size), __align_val); - } else { - return std::__libcpp_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); - } +inline _LIBCPP_HIDE_FROM_ABI void +__libcpp_deallocate(__type_identity_t<_Tp>* __ptr, + __element_count __n, + [[__maybe_unused__]] size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { + [[__maybe_unused__]] size_t __size = static_cast(__n) * sizeof(_Tp); +#if _LIBCPP_HAS_ALIGNED_ALLOCATION + if (__is_overaligned_for_new(__align)) + return __builtin_operator_delete( + __ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size), static_cast(__align)); #endif + return __builtin_operator_delete(__ptr _LIBCPP_ONLY_IF_SIZED_DEALLOCATION(, __size)); } #undef _LIBCPP_ONLY_IF_SIZED_DEALLOCATION template -inline _LIBCPP_HIDE_FROM_ABI void -__libcpp_deallocate_unsized(__type_identity_t<_Tp>* __ptr, size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { -#if !_LIBCPP_HAS_ALIGNED_ALLOCATION - (void)__align; - return std::__libcpp_operator_delete(__ptr); -#else - if (__is_overaligned_for_new(__align)) { - const align_val_t __align_val = static_cast(__align); - return std::__libcpp_operator_delete(__ptr, __align_val); - } else { - return std::__libcpp_operator_delete(__ptr); - } +inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized( + __type_identity_t<_Tp>* __ptr, [[__maybe_unused__]] size_t __align = _LIBCPP_ALIGNOF(_Tp)) _NOEXCEPT { +#if _LIBCPP_HAS_ALIGNED_ALLOCATION + if (__is_overaligned_for_new(__align)) + return __builtin_operator_delete(__ptr, static_cast(__align)); #endif + return __builtin_operator_delete(__ptr); } _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__new/launder.h b/libcxx/include/__new/launder.h index 83d80015913d9..886f614eed2e7 100644 --- a/libcxx/include/__new/launder.h +++ b/libcxx/include/__new/launder.h @@ -10,8 +10,6 @@ #define _LIBCPP___NEW_LAUNDER_H #include <__config> -#include 
<__type_traits/is_function.h> -#include <__type_traits/is_void.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -20,15 +18,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD template [[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp* __launder(_Tp* __p) _NOEXCEPT { - static_assert(!(is_function<_Tp>::value), "can't launder functions"); - static_assert(!is_void<_Tp>::value, "can't launder cv-void"); + // The compiler diagnoses misuses of __builtin_launder, so we don't need to add any static_asserts + // to implement the Mandates. return __builtin_launder(__p); } #if _LIBCPP_STD_VER >= 17 template [[nodiscard]] inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp* launder(_Tp* __p) noexcept { - return std::__launder(__p); + return __builtin_launder(__p); } #endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__tree b/libcxx/include/__tree index b3c0ece8e5fdb..f8bb4f01b1e29 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -646,12 +646,7 @@ public: using reference = value_type&; using pointer = __rebind_pointer_t<_NodePtr, value_type>; - _LIBCPP_HIDE_FROM_ABI __tree_iterator() _NOEXCEPT -#if _LIBCPP_STD_VER >= 14 - : __ptr_(nullptr) -#endif - { - } + _LIBCPP_HIDE_FROM_ABI __tree_iterator() _NOEXCEPT : __ptr_(nullptr) {} _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_np()->__value_; } _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(__get_np()->__value_); } @@ -720,12 +715,7 @@ public: using reference = const value_type&; using pointer = __rebind_pointer_t<_NodePtr, const value_type>; - _LIBCPP_HIDE_FROM_ABI __tree_const_iterator() _NOEXCEPT -#if _LIBCPP_STD_VER >= 14 - : __ptr_(nullptr) -#endif - { - } + _LIBCPP_HIDE_FROM_ABI __tree_const_iterator() _NOEXCEPT : __ptr_(nullptr) {} private: typedef __tree_iterator<_Tp, __node_pointer, difference_type> __non_const_iterator; @@ -865,17 +855,11 @@ public: private: _LIBCPP_HIDE_FROM_ABI const __node_allocator& __node_alloc() const _NOEXCEPT { return __node_alloc_; } - _LIBCPP_HIDE_FROM_ABI __end_node_pointer& __begin_node() _NOEXCEPT { return __begin_node_; } - _LIBCPP_HIDE_FROM_ABI const __end_node_pointer& __begin_node() const _NOEXCEPT { return __begin_node_; } public: _LIBCPP_HIDE_FROM_ABI allocator_type __alloc() const _NOEXCEPT { return allocator_type(__node_alloc()); } -private: - _LIBCPP_HIDE_FROM_ABI size_type& size() _NOEXCEPT { return __size_; } - -public: - _LIBCPP_HIDE_FROM_ABI const size_type& size() const _NOEXCEPT { return __size_; } + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } _LIBCPP_HIDE_FROM_ABI value_compare& value_comp() _NOEXCEPT { return __value_comp_; } _LIBCPP_HIDE_FROM_ABI const value_compare& value_comp() const _NOEXCEPT { return __value_comp_; } @@ -912,8 +896,8 @@ public: _LIBCPP_HIDE_FROM_ABI ~__tree(); - _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__begin_node()); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__begin_node()); } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__begin_node_); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__begin_node_); } _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(__end_node()); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(__end_node()); } @@ -1235,30 +1219,30 @@ template __tree<_Tp, _Compare, _Allocator>::__tree(const value_compare& __comp) _NOEXCEPT_( 
is_nothrow_default_constructible<__node_allocator>::value&& is_nothrow_copy_constructible::value) : __size_(0), __value_comp_(__comp) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } template __tree<_Tp, _Compare, _Allocator>::__tree(const allocator_type& __a) : __begin_node_(), __node_alloc_(__node_allocator(__a)), __size_(0) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } template __tree<_Tp, _Compare, _Allocator>::__tree(const value_compare& __comp, const allocator_type& __a) : __begin_node_(), __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(__comp) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } -// Precondition: size() != 0 +// Precondition: __size_ != 0 template typename __tree<_Tp, _Compare, _Allocator>::__node_pointer __tree<_Tp, _Compare, _Allocator>::_DetachedTreeCache::__detach_from_tree(__tree* __t) _NOEXCEPT { - __node_pointer __cache = static_cast<__node_pointer>(__t->__begin_node()); - __t->__begin_node() = __t->__end_node(); + __node_pointer __cache = static_cast<__node_pointer>(__t->__begin_node_); + __t->__begin_node_ = __t->__end_node(); __t->__end_node()->__left_->__parent_ = nullptr; __t->__end_node()->__left_ = nullptr; - __t->size() = 0; + __t->__size_ = 0; // __cache->__left_ == nullptr if (__cache->__right_ != nullptr) __cache = static_cast<__node_pointer>(__cache->__right_); @@ -1310,7 +1294,7 @@ void __tree<_Tp, _Compare, _Allocator>::__assign_unique(_ForwardIterator __first is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type"); static_assert( __has_forward_iterator_category<_ForwardIterator>::value, "__assign_unique requires a forward iterator"); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); for (; __cache.__get() != nullptr && __first != __last; ++__first) { if (__node_assign_unique(*__first, __cache.__get()).second) @@ -1328,7 +1312,7 @@ void __tree<_Tp, _Compare, _Allocator>::__assign_multi(_InputIterator __first, _ typedef typename _ITraits::value_type _ItValueType; static_assert( is_same<_ItValueType, value_type>::value, "__assign_multi may only be called with the containers value_type"); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); for (; __cache.__get() && __first != __last; ++__first) { __assign_value(__cache.__get()->__value_, *__first); @@ -1347,7 +1331,7 @@ __tree<_Tp, _Compare, _Allocator>::__tree(const __tree& __t) __node_alloc_(__node_traits::select_on_container_copy_construction(__t.__node_alloc())), __size_(0), __value_comp_(__t.value_comp()) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } template @@ -1358,13 +1342,13 @@ __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t) _NOEXCEPT_( __node_alloc_(std::move(__t.__node_alloc_)), __size_(__t.__size_), __value_comp_(std::move(__t.__value_comp_)) { - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else { __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __t.__begin_node() = __t.__end_node(); + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } @@ -1372,19 +1356,19 @@ template __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t, const allocator_type& __a) : __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(std::move(__t.value_comp())) { if (__a == __t.__alloc()) { - if (__t.size() == 0) - __begin_node() = 
__end_node(); + if (__t.__size_ == 0) + __begin_node_ = __end_node(); else { - __begin_node() = __t.__begin_node(); + __begin_node_ = __t.__begin_node_; __end_node()->__left_ = __t.__end_node()->__left_; __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - size() = __t.size(); - __t.__begin_node() = __t.__end_node(); + __size_ = __t.__size_; + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } else { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } } @@ -1397,13 +1381,13 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, true_type) __move_assign_alloc(__t); __size_ = __t.__size_; __value_comp_ = std::move(__t.__value_comp_); - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else { __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __t.__begin_node() = __t.__end_node(); + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } @@ -1414,15 +1398,15 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, false_type) { else { value_comp() = std::move(__t.value_comp()); const_iterator __e = end(); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); - while (__cache.__get() != nullptr && __t.size() != 0) { + while (__cache.__get() != nullptr && __t.__size_ != 0) { __assign_value(__cache.__get()->__value_, std::move(__t.remove(__t.begin())->__value_)); __node_insert_multi(__cache.__get()); __cache.__advance(); } } - while (__t.size() != 0) { + while (__t.__size_ != 0) { __insert_multi_from_orphaned_node(__e, std::move(__t.remove(__t.begin())->__value_)); } } @@ -1470,12 +1454,12 @@ void __tree<_Tp, _Compare, _Allocator>::swap(__tree& __t) std::__swap_allocator(__node_alloc(), __t.__node_alloc()); swap(__size_, __t.__size_); swap(__value_comp_, __t.__value_comp_); - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else __end_node()->__left_->__parent_ = __end_node(); - if (__t.size() == 0) - __t.__begin_node() = __t.__end_node(); + if (__t.__size_ == 0) + __t.__begin_node_ = __t.__end_node(); else __t.__end_node()->__left_->__parent_ = __t.__end_node(); } @@ -1483,8 +1467,8 @@ void __tree<_Tp, _Compare, _Allocator>::swap(__tree& __t) template void __tree<_Tp, _Compare, _Allocator>::clear() _NOEXCEPT { destroy(__root()); - size() = 0; - __begin_node() = __end_node(); + __size_ = 0; + __begin_node_ = __end_node(); __end_node()->__left_ = nullptr; } @@ -1674,10 +1658,10 @@ void __tree<_Tp, _Compare, _Allocator>::__insert_node_at( __new_node->__parent_ = __parent; // __new_node->__is_black_ is initialized in __tree_balance_after_insert __child = __new_node; - if (__begin_node()->__left_ != nullptr) - __begin_node() = static_cast<__end_node_pointer>(__begin_node()->__left_); + if (__begin_node_->__left_ != nullptr) + __begin_node_ = static_cast<__end_node_pointer>(__begin_node_->__left_); std::__tree_balance_after_insert(__end_node()->__left_, __child); - ++size(); + ++__size_; } template @@ -1821,9 +1805,9 @@ typename __tree<_Tp, _Compare, _Allocator>::iterator __tree<_Tp, _Compare, _Allocator>::__remove_node_pointer(__node_pointer __ptr) _NOEXCEPT { iterator __r(__ptr); ++__r; - if (__begin_node() == __ptr) - __begin_node() = __r.__ptr_; - --size(); + if (__begin_node_ == __ptr) + __begin_node_ = __r.__ptr_; + --__size_; 
std::__tree_remove(__end_node()->__left_, static_cast<__node_base_pointer>(__ptr)); return __r; } @@ -2187,13 +2171,13 @@ template typename __tree<_Tp, _Compare, _Allocator>::__node_holder __tree<_Tp, _Compare, _Allocator>::remove(const_iterator __p) _NOEXCEPT { __node_pointer __np = __p.__get_np(); - if (__begin_node() == __p.__ptr_) { + if (__begin_node_ == __p.__ptr_) { if (__np->__right_ != nullptr) - __begin_node() = static_cast<__end_node_pointer>(__np->__right_); + __begin_node_ = static_cast<__end_node_pointer>(__np->__right_); else - __begin_node() = static_cast<__end_node_pointer>(__np->__parent_); + __begin_node_ = static_cast<__end_node_pointer>(__np->__parent_); } - --size(); + --__size_; std::__tree_remove(__end_node()->__left_, static_cast<__node_base_pointer>(__np)); return __node_holder(__np, _Dp(__node_alloc(), true)); } diff --git a/libcxx/include/__type_traits/is_core_convertible.h b/libcxx/include/__type_traits/is_core_convertible.h index 0b8e94e101db1..b1f2a5cc7d684 100644 --- a/libcxx/include/__type_traits/is_core_convertible.h +++ b/libcxx/include/__type_traits/is_core_convertible.h @@ -24,20 +24,23 @@ _LIBCPP_BEGIN_NAMESPACE_STD // and __is_core_convertible is true in C++17 and later. template -struct __is_core_convertible : false_type {}; +inline const bool __is_core_convertible_v = false; template -struct __is_core_convertible<_Tp, _Up, decltype(static_cast(0)(static_cast<_Tp (*)()>(0)()))> - : true_type {}; +inline const bool + __is_core_convertible_v<_Tp, _Up, decltype(static_cast(0)(static_cast<_Tp (*)()>(0)()))> = true; + +template +using __is_core_convertible _LIBCPP_NODEBUG = integral_constant >; #if _LIBCPP_STD_VER >= 20 template -concept __core_convertible_to = __is_core_convertible<_Tp, _Up>::value; +concept __core_convertible_to = __is_core_convertible_v<_Tp, _Up>; #endif // _LIBCPP_STD_VER >= 20 -template ::value> +template > inline const bool __is_nothrow_core_convertible_v = false; #ifndef _LIBCPP_CXX03_LANG diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 00e196963f11c..fb40757a73fb6 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -97,19 +97,21 @@ using __barrier_phase_t _LIBCPP_NODEBUG = uint8_t; class __barrier_algorithm_base; -_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI __barrier_algorithm_base* -__construct_barrier_algorithm_base(ptrdiff_t& __expected); +[[__gnu__::__returns_nonnull__, __gnu__::__malloc__]] _LIBCPP_AVAILABILITY_SYNC + _LIBCPP_EXPORTED_FROM_ABI __barrier_algorithm_base* + __construct_barrier_algorithm_base(ptrdiff_t& __expected); _LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI bool -__arrive_barrier_algorithm_base(__barrier_algorithm_base* __barrier, __barrier_phase_t __old_phase) noexcept; +__arrive_barrier_algorithm_base([[__gnu__::__nonnull__]] _LIBCPP_NOESCAPE __barrier_algorithm_base* __barrier, + __barrier_phase_t __old_phase) noexcept; -_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void -__destroy_barrier_algorithm_base(__barrier_algorithm_base* __barrier) noexcept; +_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void __destroy_barrier_algorithm_base( + [[__gnu__::__nonnull__]] _LIBCPP_NOESCAPE __barrier_algorithm_base* __barrier) noexcept; template class __barrier_base { ptrdiff_t __expected_; - unique_ptr<__barrier_algorithm_base, void (*)(__barrier_algorithm_base*)> __base_; + unique_ptr<__barrier_algorithm_base, void (*)(_LIBCPP_NOESCAPE __barrier_algorithm_base*)> __base_; atomic __expected_adjustment_; _CompletionF __completion_; atomic<__barrier_phase_t> 
__phase_; diff --git a/libcxx/include/ext/hash_map b/libcxx/include/ext/hash_map index d6b92204f4376..46815eaffa8bd 100644 --- a/libcxx/include/ext/hash_map +++ b/libcxx/include/ext/hash_map @@ -744,7 +744,7 @@ public: _LIBCPP_HIDE_FROM_ABI const_iterator begin() const { return __table_.begin(); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const { return __table_.end(); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_unique(__x); } + _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_multi(__x); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator, const value_type& __x) { return insert(__x); } template _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last); @@ -831,7 +831,7 @@ template template inline void hash_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>::insert(_InputIterator __first, _InputIterator __last) { for (; __first != __last; ++__first) - __table_.__emplace_unique(*__first); + __table_.__emplace_multi(*__first); } template diff --git a/libcxx/include/ext/hash_set b/libcxx/include/ext/hash_set index 7fd5df24ed3a8..62a7a0dbcffb9 100644 --- a/libcxx/include/ext/hash_set +++ b/libcxx/include/ext/hash_set @@ -458,7 +458,7 @@ public: _LIBCPP_HIDE_FROM_ABI const_iterator begin() const { return __table_.begin(); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const { return __table_.end(); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_unique(__x); } + _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_multi(__x); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator, const value_type& __x) { return insert(__x); } template _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last); @@ -543,7 +543,7 @@ template template inline void hash_multiset<_Value, _Hash, _Pred, _Alloc>::insert(_InputIterator __first, _InputIterator __last) { for (; __first != __last; ++__first) - __table_.__emplace_unique(*__first); + __table_.__emplace_multi(*__first); } template diff --git a/libcxx/include/fstream b/libcxx/include/fstream index c86f709bedb80..6d3f20fff688f 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -401,6 +401,14 @@ private: } } } + + _LIBCPP_HIDE_FROM_ABI typename traits_type::int_type __overflow_failed() { + if (this->pptr() == this->epptr() + 1) { + this->pbump(-1); // lose the character we overflowed above -- we don't really have a + // choice since we couldn't commit the contents of the put area + } + return traits_type::eof(); + } }; template @@ -841,8 +849,9 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> if (__always_noconv_) { size_t __n = static_cast(this->pptr() - this->pbase()); - if (std::fwrite(this->pbase(), sizeof(char_type), __n, __file_) != __n) - return traits_type::eof(); + if (std::fwrite(this->pbase(), sizeof(char_type), __n, __file_) != __n) { + return __overflow_failed(); + } } else { if (!__cv_) std::__throw_bad_cast(); @@ -854,34 +863,38 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> char* __extbuf_end = __extbuf_; do { codecvt_base::result __r = __cv_->out(__st_, __b, __p, __end, __extbuf_, __extbuf_ + __ebs_, __extbuf_end); - if (__end == __b) - return traits_type::eof(); + if (__end == __b) { + return __overflow_failed(); + } // No conversion needed: output characters directly to the file, done. 
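The do/while conversion loop that overflow() runs above has the following overall shape when lifted out of basic_filebuf into a freestanding sketch. The function name and the 64-byte buffer are invented for the sketch, and the noconv branch handled below is omitted because a wchar_t-to-char codecvt always converts:

    #include <cstdio>
    #include <locale>

    bool flush_converted(const wchar_t* first, const wchar_t* last, std::FILE* file) {
      auto const& cv = std::use_facet<std::codecvt<wchar_t, char, std::mbstate_t>>(std::locale());
      std::mbstate_t st{};
      char buf[64];
      while (first != last) {
        const wchar_t* from_next = first;
        char* to_next            = buf;
        std::codecvt_base::result r = cv.out(st, first, last, from_next, buf, buf + sizeof(buf), to_next);
        if (r == std::codecvt_base::error || from_next == first)
          return false; // no progress possible: report failure, like __overflow_failed()
        std::size_t n = static_cast<std::size_t>(to_next - buf);
        if (std::fwrite(buf, 1, n, file) != n)
          return false; // short write is also a failure
        first = from_next; // 'partial' results simply continue with the remaining characters
      }
      return true;
    }

    int main() {
      const wchar_t msg[] = L"hello";
      flush_converted(msg, msg + 5, stdout);
    }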
if (__r == codecvt_base::noconv) { size_t __n = static_cast(__p - __b); - if (std::fwrite(__b, 1, __n, __file_) != __n) - return traits_type::eof(); + if (std::fwrite(__b, 1, __n, __file_) != __n) { + return __overflow_failed(); + } break; // Conversion successful: output the converted characters to the file, done. } else if (__r == codecvt_base::ok) { size_t __n = static_cast(__extbuf_end - __extbuf_); - if (std::fwrite(__extbuf_, 1, __n, __file_) != __n) - return traits_type::eof(); + if (std::fwrite(__extbuf_, 1, __n, __file_) != __n) { + return __overflow_failed(); + } break; // Conversion partially successful: output converted characters to the file and repeat with the // remaining characters. } else if (__r == codecvt_base::partial) { size_t __n = static_cast(__extbuf_end - __extbuf_); - if (std::fwrite(__extbuf_, 1, __n, __file_) != __n) - return traits_type::eof(); + if (std::fwrite(__extbuf_, 1, __n, __file_) != __n) { + return __overflow_failed(); + } __b = const_cast(__end); continue; } else { - return traits_type::eof(); + return __overflow_failed(); } } while (true); } diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 61ba1c381b2b3..78607f2c1301d 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1369,6 +1369,7 @@ module std [system] { module indic_conjunct_break_table { header "__format/indic_conjunct_break_table.h" } module parser_std_format_spec { header "__format/parser_std_format_spec.h" } module range_default_formatter { header "__format/range_default_formatter.h" } + module range_format { header "__format/range_format.h" } module range_formatter { header "__format/range_formatter.h" } module unicode { header "__format/unicode.h" } module width_estimation_table { header "__format/width_estimation_table.h" } @@ -1590,7 +1591,6 @@ module std [system] { } module locale_base_api { - textual header "__locale_dir/locale_base_api/android.h" textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" textual header "__locale_dir/locale_base_api/ibm.h" textual header "__locale_dir/locale_base_api/musl.h" diff --git a/libcxx/include/optional b/libcxx/include/optional index fa32d75ef86dd..e81bff50daad6 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -205,6 +205,7 @@ namespace std { # include <__type_traits/is_assignable.h> # include <__type_traits/is_constructible.h> # include <__type_traits/is_convertible.h> +# include <__type_traits/is_core_convertible.h> # include <__type_traits/is_destructible.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> @@ -982,11 +983,13 @@ public: template optional(_Tp) -> optional<_Tp>; -// Comparisons between optionals +// [optional.relops] Relational operators + template < class _Tp, class _Up, - enable_if_t() == std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() == std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, const optional<_Up>& __y) { if (static_cast(__x) != static_cast(__y)) return false; @@ -998,7 +1001,8 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, const template < class _Tp, class _Up, - enable_if_t() != std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() != std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const optional<_Tp>& __x, const optional<_Up>& __y) { if (static_cast(__x) != static_cast(__y)) 
return true; @@ -1007,10 +1011,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const optional<_Tp>& __x, const return *__x != *__y; } -template < - class _Tp, - class _Up, - enable_if_t() < std::declval()), bool>, int> = 0> +template < class _Tp, + class _Up, + enable_if_t<__is_core_convertible_v() < std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const optional<_Tp>& __x, const optional<_Up>& __y) { if (!static_cast(__y)) return false; @@ -1019,10 +1023,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const optional<_Tp>& __x, const o return *__x < *__y; } -template < - class _Tp, - class _Up, - enable_if_t() > std::declval()), bool>, int> = 0> +template < class _Tp, + class _Up, + enable_if_t<__is_core_convertible_v() > std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const optional<_Tp>& __x, const optional<_Up>& __y) { if (!static_cast(__x)) return false; @@ -1034,7 +1038,8 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const optional<_Tp>& __x, const o template < class _Tp, class _Up, - enable_if_t() <= std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() <= std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const optional<_Tp>& __x, const optional<_Up>& __y) { if (!static_cast(__x)) return true; @@ -1046,7 +1051,8 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const optional<_Tp>& __x, const template < class _Tp, class _Up, - enable_if_t() >= std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() >= std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const optional<_Tp>& __x, const optional<_Up>& __y) { if (!static_cast(__y)) return true; @@ -1067,7 +1073,8 @@ operator<=>(const optional<_Tp>& __x, const optional<_Up>& __y) { # endif // _LIBCPP_STD_VER >= 20 -// Comparisons with nullopt +// [optional.nullops] Comparison with nullopt + template _LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, nullopt_t) noexcept { return !static_cast(__x); @@ -1139,11 +1146,13 @@ _LIBCPP_HIDE_FROM_ABI constexpr strong_ordering operator<=>(const optional<_Tp>& # endif // _LIBCPP_STD_VER <= 17 -// Comparisons with T +// [optional.comp.with.t] Comparison with T + template < class _Tp, class _Up, - enable_if_t() == std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() == std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x == __v : false; } @@ -1151,7 +1160,8 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const optional<_Tp>& __x, const template < class _Tp, class _Up, - enable_if_t() == std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() == std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v == *__x : false; } @@ -1159,7 +1169,8 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator==(const _Tp& __v, const optional<_ template < class _Tp, class _Up, - enable_if_t() != std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() != std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? 
*__x != __v : true; } @@ -1167,23 +1178,24 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const optional<_Tp>& __x, const template < class _Tp, class _Up, - enable_if_t() != std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() != std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator!=(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v != *__x : true; } -template < - class _Tp, - class _Up, - enable_if_t() < std::declval()), bool>, int> = 0> +template < class _Tp, + class _Up, + enable_if_t<__is_core_convertible_v() < std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x < __v : true; } -template < - class _Tp, - class _Up, - enable_if_t() < std::declval()), bool>, int> = 0> +template < class _Tp, + class _Up, + enable_if_t<__is_core_convertible_v() < std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v < *__x : false; } @@ -1191,7 +1203,8 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator<(const _Tp& __v, const optional<_U template < class _Tp, class _Up, - enable_if_t() <= std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() <= std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x <= __v : true; } @@ -1199,23 +1212,24 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const optional<_Tp>& __x, const template < class _Tp, class _Up, - enable_if_t() <= std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() <= std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator<=(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v <= *__x : false; } -template < - class _Tp, - class _Up, - enable_if_t() > std::declval()), bool>, int> = 0> +template < class _Tp, + class _Up, + enable_if_t<__is_core_convertible_v() > std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x > __v : false; } -template < - class _Tp, - class _Up, - enable_if_t() > std::declval()), bool>, int> = 0> +template < class _Tp, + class _Up, + enable_if_t<__is_core_convertible_v() > std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? __v > *__x : true; } @@ -1223,7 +1237,8 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator>(const _Tp& __v, const optional<_U template < class _Tp, class _Up, - enable_if_t() >= std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() >= std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const optional<_Tp>& __x, const _Up& __v) { return static_cast(__x) ? *__x >= __v : false; } @@ -1231,7 +1246,8 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const optional<_Tp>& __x, const template < class _Tp, class _Up, - enable_if_t() >= std::declval()), bool>, int> = 0> + enable_if_t<__is_core_convertible_v() >= std::declval()), bool>, + int> = 0> _LIBCPP_HIDE_FROM_ABI constexpr bool operator>=(const _Tp& __v, const optional<_Up>& __x) { return static_cast(__x) ? 
__v >= *__x : true; } diff --git a/libcxx/include/string b/libcxx/include/string index 514dd91c7c172..788af36d67c58 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -700,18 +700,18 @@ __concatenate_strings(const _Allocator& __alloc, __type_identity_t > __str2); template -struct __string_is_trivial_iterator : public false_type {}; +inline const bool __string_is_trivial_iterator_v = false; template -struct __string_is_trivial_iterator<_Tp*> : public is_arithmetic<_Tp> {}; +inline const bool __string_is_trivial_iterator_v<_Tp*> = is_arithmetic<_Tp>::value; template -struct __string_is_trivial_iterator<__wrap_iter<_Iter> > : public __string_is_trivial_iterator<_Iter> {}; +inline const bool __string_is_trivial_iterator_v<__wrap_iter<_Iter> > = __string_is_trivial_iterator_v<_Iter>; template -struct __can_be_converted_to_string_view - : public _BoolConstant< is_convertible >::value && - !is_convertible::value > {}; +inline const bool __can_be_converted_to_string_view_v = + is_convertible >::value && + !is_convertible::value; struct __uninitialized_size_tag {}; struct __init_with_sentinel_tag {}; @@ -1125,7 +1125,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 @@ -1137,7 +1137,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t) { @@ -1146,7 +1146,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t, const allocator_type& __a) @@ -1205,7 +1205,7 @@ public: operator=(const basic_string& __str); template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator=(const _Tp& __t) { @@ -1342,7 +1342,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string >::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator+=(const _Tp& __t) { @@ -1371,7 +1371,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const _Tp& __t) { @@ -1382,7 +1382,7 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const basic_string& __str, size_type __pos, size_type __n = npos); template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1415,7 +1415,7 @@ public: size_type __cap = capacity(); size_type __n = static_cast(std::distance(__first, __last)); if (__n) { - if (__string_is_trivial_iterator<_ForwardIterator>::value && !__addr_in_range(*__first)) { + if (__string_is_trivial_iterator_v<_ForwardIterator> && !__addr_in_range(*__first)) { if (__cap - __sz < __n) 
__grow_by_without_replace(__cap, __sz + __n - __cap, __sz, __sz, 0); __annotate_increase(__n); @@ -1467,7 +1467,7 @@ public: return *(data() + size() - 1); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const _Tp& __t) { __self_view __sv = __t; return assign(__sv.data(), __sv.size()); @@ -1509,7 +1509,7 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const basic_string& __str, size_type __pos, size_type __n = npos); template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1535,7 +1535,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(_ForwardIterator __first, _ForwardIterator __last) { - if (__string_is_trivial_iterator<_ForwardIterator>::value) { + if (__string_is_trivial_iterator_v<_ForwardIterator>) { size_type __n = static_cast(std::distance(__first, __last)); __assign_trivial(__first, __last, __n); } else { @@ -1548,7 +1548,7 @@ public: # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_CharT> _Range> _LIBCPP_HIDE_FROM_ABI constexpr basic_string& assign_range(_Range&& __range) { - if constexpr (__string_is_trivial_iterator>::value && + if constexpr (__string_is_trivial_iterator_v> && (ranges::forward_range<_Range> || ranges::sized_range<_Range>)) { size_type __n = static_cast(ranges::distance(__range)); __assign_trivial(ranges::begin(__range), ranges::end(__range), __n); @@ -1572,14 +1572,14 @@ public: return insert(__pos1, __str.data(), __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos1, const _Tp& __t) { __self_view __sv = __t; return insert(__pos1, __sv.data(), __sv.size()); } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1649,7 +1649,7 @@ public: return replace(__pos1, __n1, __str.data(), __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(size_type __pos1, size_type __n1, const _Tp& __t) { __self_view __sv = __t; @@ -1660,7 +1660,7 @@ public: replace(size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2 = npos); template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1683,7 +1683,7 @@ public: static_cast(__i1 - begin()), static_cast(__i2 - __i1), __str.data(), __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(const_iterator __i1, const_iterator __i2, const _Tp& __t) { __self_view __sv = __t; @@ -1776,7 +1776,7 @@ public: return std::__str_find(data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; @@ -1807,7 +1807,7 @@ public: data(), size(), __str.data(), __pos, 
__str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; @@ -1838,7 +1838,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; @@ -1872,7 +1872,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; @@ -1906,7 +1906,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; @@ -1940,7 +1940,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; @@ -1972,7 +1972,7 @@ public: return compare(__self_view(__str)); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const _Tp& __t) const _NOEXCEPT { __self_view __sv = __t; size_t __lhs_sz = size(); @@ -1987,7 +1987,7 @@ public: return 0; } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const _Tp& __t) const { __self_view __sv = __t; @@ -2005,7 +2005,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int @@ -2951,7 +2951,7 @@ template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void basic_string<_CharT, _Traits, _Allocator>::__assign_trivial(_Iterator __first, _Sentinel __last, size_type __n) { _LIBCPP_ASSERT_INTERNAL( - __string_is_trivial_iterator<_Iterator>::value, "The iterator type given to `__assign_trivial` must be trivial"); + __string_is_trivial_iterator_v<_Iterator>, "The iterator type given to `__assign_trivial` must be trivial"); size_type __old_size = size(); size_type __cap = capacity(); @@ -3166,7 +3166,7 @@ basic_string<_CharT, _Traits, _Allocator>::__insert_with_size( if (__n == 0) return begin() + __ip; - if (__string_is_trivial_iterator<_Iterator>::value && !__addr_in_range(*__first)) { + if (__string_is_trivial_iterator_v<_Iterator> && !__addr_in_range(*__first)) { return __insert_from_safe_copy(__n, __ip, std::move(__first), std::move(__last)); } else { const basic_string __temp(__init_with_sentinel_tag(), std::move(__first), std::move(__last), __alloc_); diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map index 5b70cdeae11a5..97c2c52eba337 100644 --- a/libcxx/include/unordered_map +++ b/libcxx/include/unordered_map @@ -844,10 +844,10 @@ class __hash_map_iterator { public: typedef forward_iterator_tag iterator_category; - typedef typename _NodeTypes::__map_value_type value_type; + using value_type = typename 
diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map
index 5b70cdeae11a5..97c2c52eba337 100644
--- a/libcxx/include/unordered_map
+++ b/libcxx/include/unordered_map
@@ -844,10 +844,10 @@ class __hash_map_iterator {
 public:
   typedef forward_iterator_tag iterator_category;
-  typedef typename _NodeTypes::__map_value_type value_type;
+  using value_type = typename _HashIterator::value_type;
   typedef typename _NodeTypes::difference_type difference_type;
   typedef value_type& reference;
-  typedef typename _NodeTypes::__map_value_type_pointer pointer;
+  using pointer = typename _HashIterator::pointer;
 
   _LIBCPP_HIDE_FROM_ABI __hash_map_iterator() _NOEXCEPT {}
 
@@ -895,10 +895,10 @@ class __hash_map_const_iterator {
 public:
   typedef forward_iterator_tag iterator_category;
-  typedef typename _NodeTypes::__map_value_type value_type;
+  using value_type = typename _HashIterator::value_type;
   typedef typename _NodeTypes::difference_type difference_type;
   typedef const value_type& reference;
-  typedef typename _NodeTypes::__const_map_value_type_pointer pointer;
+  using pointer = typename _HashIterator::pointer;
 
   _LIBCPP_HIDE_FROM_ABI __hash_map_const_iterator() _NOEXCEPT {}
 
diff --git a/libcxx/src/barrier.cpp b/libcxx/src/barrier.cpp
index 868f1bfbaffc2..72c29b49ada39 100644
--- a/libcxx/src/barrier.cpp
+++ b/libcxx/src/barrier.cpp
@@ -60,11 +60,12 @@ class __barrier_algorithm_base {
 _LIBCPP_EXPORTED_FROM_ABI __barrier_algorithm_base* __construct_barrier_algorithm_base(ptrdiff_t& __expected) {
   return new __barrier_algorithm_base(__expected);
 }
-_LIBCPP_EXPORTED_FROM_ABI bool
-__arrive_barrier_algorithm_base(__barrier_algorithm_base* __barrier, __barrier_phase_t __old_phase) noexcept {
+_LIBCPP_EXPORTED_FROM_ABI bool __arrive_barrier_algorithm_base(
+    _LIBCPP_NOESCAPE __barrier_algorithm_base* __barrier, __barrier_phase_t __old_phase) noexcept {
   return __barrier->__arrive(__old_phase);
 }
-_LIBCPP_EXPORTED_FROM_ABI void __destroy_barrier_algorithm_base(__barrier_algorithm_base* __barrier) noexcept {
+_LIBCPP_EXPORTED_FROM_ABI void
+__destroy_barrier_algorithm_base(_LIBCPP_NOESCAPE __barrier_algorithm_base* __barrier) noexcept {
   delete __barrier;
 }
 
diff --git a/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp b/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp
new file mode 100644
index 0000000000000..ea80359f1fea2
--- /dev/null
+++ b/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
+// hash_multimap::insert
+
+#include <cassert>
+#include <ext/hash_map>
+
+int main(int, char**) {
+  __gnu_cxx::hash_multimap<int, int> map;
+
+  map.insert(std::make_pair(1, 1));
+  map.insert(std::make_pair(1, 1));
+
+  assert(map.size() == 2);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  std::pair<int, int> arr[] = {std::make_pair(1, 1), std::make_pair(1, 1)};
+
+  map.insert(arr, arr + 2);
+
+  assert(map.size() == 4);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  return 0;
+}
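// The barrier.cpp hunk above annotates the pointer parameters with _LIBCPP_NOESCAPE.
// On Clang this macro typically expands to __attribute__((noescape)): a promise that
// the callee does not retain the pointer beyond the call, which lets callers optimize
// more aggressively. A hedged sketch of the attribute itself; MY_NOESCAPE is a stand-in,
// not libc++'s actual macro definition (that lives in its configuration headers):

#if defined(__clang__)
#  define MY_NOESCAPE __attribute__((noescape))
#else
#  define MY_NOESCAPE
#endif

// The annotation promises `counter` is used only for the duration of the call,
// never stored; passing the address of a local is therefore known to be safe.
void bump(MY_NOESCAPE int* counter) { ++*counter; }

int demo() {
  int n = 0;
  bump(&n); // the caller may assume &n does not escape this call
  return n;
}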
diff --git a/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp b/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp
new file mode 100644
index 0000000000000..1a60cac158a40
--- /dev/null
+++ b/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
+// hash_multiset::insert
+
+#include <cassert>
+#include <ext/hash_set>
+
+int main(int, char**) {
+  __gnu_cxx::hash_multiset<int> map;
+
+  map.insert(1);
+  map.insert(1);
+
+  assert(map.size() == 2);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  int arr[] = {1, 1};
+
+  map.insert(arr, arr + 2);
+
+  assert(map.size() == 4);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  return 0;
+}
diff --git a/libcxx/test/extensions/libcxx/always_initialize_iterators.pass.cpp b/libcxx/test/extensions/libcxx/always_initialize_iterators.pass.cpp
new file mode 100644
index 0000000000000..1893ca913ad01
--- /dev/null
+++ b/libcxx/test/extensions/libcxx/always_initialize_iterators.pass.cpp
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Make sure that the iterator types of the associative containers are initialized even when default initialized.
+// This is an extension because the standard only requires initialization when an iterator is value initialized, and
+// that only since C++14. We guarantee that iterators are always initialized, even in C++11.
+
+#include <cassert>
+#include <map>
+#include <set>
+
+template <class Iter>
+void test() {
+  {
+    Iter iter1;
+    Iter iter2;
+    assert(iter1 == iter2);
+  }
+  {
+    Iter iter1;
+    Iter iter2{};
+    assert(iter1 == iter2);
+  }
+  {
+    Iter iter1{};
+    Iter iter2;
+    assert(iter1 == iter2);
+  }
+}
+
+template <class Container>
+void test_container() {
+  test<typename Container::iterator>();
+  test<typename Container::const_iterator>();
+  test<typename Container::reverse_iterator>();
+  test<typename Container::const_reverse_iterator>();
+}
+
+int main(int, char**) {
+  test_container<std::map<int, int>>();
+  test_container<std::multimap<int, int>>();
+  test_container<std::set<int>>();
+  test_container<std::multiset<int>>();
+
+  return 0;
+}
diff --git a/libcxx/test/extensions/libcxx/localization/lit.local.cfg b/libcxx/test/extensions/libcxx/localization/lit.local.cfg
new file mode 100644
index 0000000000000..d47f3e0fe4752
--- /dev/null
+++ b/libcxx/test/extensions/libcxx/localization/lit.local.cfg
@@ -0,0 +1,3 @@
+# tests are obviously not supported when localization support is disabled
+if "no-localization" in config.available_features:
+    config.unsupported = True
diff --git a/libcxx/test/libcxx/localization/locales/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp b/libcxx/test/extensions/libcxx/localization/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp
similarity index 100%
rename from libcxx/test/libcxx/localization/locales/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp
rename to libcxx/test/extensions/libcxx/localization/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp
diff --git a/libcxx/test/libcxx/algorithms/vectorization.compile.pass.cpp b/libcxx/test/libcxx/algorithms/vectorization.compile.pass.cpp
index 733a147b80cc3..872c49a35dd76 100644
--- a/libcxx/test/libcxx/algorithms/vectorization.compile.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/vectorization.compile.pass.cpp
@@ -17,9 +17,6 @@
 // We don't vectorize algorithms on AIX right now.
// XFAIL: target={{.+}}-aix{{.*}} -// We don't vectorize on AppleClang 15 since that apparently breaks std::mismatch -// XFAIL: apple-clang-15 - // This test ensures that we enable the vectorization of algorithms on the expected // platforms. diff --git a/libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp b/libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp deleted file mode 100644 index e00a028489a72..0000000000000 --- a/libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp +++ /dev/null @@ -1,60 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - -#include <__hash_table> -#include -#include -#include - -#include "test_macros.h" -#include "min_allocator.h" - -void testKeyValueTrait() { - { - typedef int Tp; - typedef std::__hash_key_value_types Traits; - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert(Traits::__is_map == false, ""); - } - { - typedef std::pair Tp; - typedef std::__hash_key_value_types Traits; - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert(Traits::__is_map == false, ""); - } - { - typedef std::pair Tp; - typedef std::__hash_key_value_types Traits; - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert(Traits::__is_map == false, ""); - } - { - typedef std::__hash_value_type Tp; - typedef std::__hash_key_value_types Traits; - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same >::value), ""); - static_assert((std::is_same >::value), ""); - static_assert(Traits::__is_map == true, ""); - } -} - -int main(int, char**) { - testKeyValueTrait(); - - return 0; -} diff --git a/libcxx/test/libcxx/mem/mem.res/ctor.nullptr.assert.pass.cpp b/libcxx/test/libcxx/mem/mem.res/ctor.nullptr.assert.pass.cpp new file mode 100644 index 0000000000000..831985f2ce37f --- /dev/null +++ b/libcxx/test/libcxx/mem/mem.res/ctor.nullptr.assert.pass.cpp @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// Test hardening assertions for std::pmr::polymorphic_allocator. 
+ +// REQUIRES: has-unix-headers +// REQUIRES: libcpp-hardening-mode={{extensive|debug}} +// UNSUPPORTED: c++03, c++11, c++14 +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing + +// We're testing nullptr assertions +// ADDITIONAL_COMPILE_FLAGS: -Wno-nonnull + +#include + +#include "check_assertion.h" + +int main(int, char**) { + TEST_LIBCPP_ASSERT_FAILURE( + std::pmr::polymorphic_allocator(nullptr), "Attempted to pass a nullptr resource to polymorphic_alloator"); + + return 0; +} diff --git a/libcxx/test/libcxx/mem/mem.res/nonnull.verify.cpp b/libcxx/test/libcxx/mem/mem.res/nonnull.verify.cpp new file mode 100644 index 0000000000000..c5a6c8a3db3cf --- /dev/null +++ b/libcxx/test/libcxx/mem/mem.res/nonnull.verify.cpp @@ -0,0 +1,17 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Ensure that passing a nullptr to polymorphic_allocator is diagnosed + +#include + +void test() { + std::pmr::polymorphic_allocator alloc(nullptr); // expected-warning {{null passed}} +} diff --git a/libcxx/test/libcxx/minimal_cxx11_configuration.pass.cpp b/libcxx/test/libcxx/minimal_cxx11_configuration.pass.cpp deleted file mode 100644 index e0811e02f5c13..0000000000000 --- a/libcxx/test/libcxx/minimal_cxx11_configuration.pass.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Test the set of C++11 features that Clang provides as an extension in C++03 mode. -// The language features we expect are: -// -// 1. rvalue references (and perfect forwarding) -// 2. variadic templates -// 3. alias templates -// 4. defaulted and deleted functions. -// 5. default values for non-type template parameters. -// -// Some features we don't get and can't be used in extended C++03 mode: -// -// 1. noexcept and constexpr -// 2. Two closing '>' without a space. - -#include -#include - -// Equals delete and default are allowed in minimal C++03 mode. -namespace test_eq_delete_and_default { -void t1() = delete; -struct T2 { - T2() = default; - T2(T2 const&) = delete; -}; -} - -namespace alias_templates { -template -using X = T; -static_assert((std::is_same, int>::value), ""); -} - -namespace variadics_templates { -template -int t1(Args...) 
{ - return sizeof...(Args); -} -void test() { - assert(t1() == 0); - assert(t1(42) == 1); - assert(t1(1, 2, 3) == 3); -} -} - -namespace rvalue_references_move_semantics { -struct T { - T() : moved(0) {} - T(T const& other) : moved(other.moved) {} - T(T&& other) : moved(other.moved) { ++moved; other.moved = -1; } - int moved; -}; -void f(T o, int expect_moved) { assert(o.moved == expect_moved); } -void test() { - { - T t; - assert(t.moved == 0); - T t2(static_cast(t)); - assert(t2.moved == 1); - assert(t.moved == -1); - } - { - T t; - f(t, 0); - f(static_cast(t), 1); - } -} -} - -namespace rvalue_references_perfect_forwarding { -template -void f(T&&) { - static_assert((std::is_same::value), ""); -} -void test() { - int x = 42; - f(x); - f(42); - f(static_cast(x)); -} -} - -namespace default_values_for_nttp { -template -void f() { assert(I == 42); } -void test() { - f(); -} -} - -namespace reference_qualified_functions { -struct T { - T() : lvalue_called(0), rvalue_called(0) {} - void foo() const & { lvalue_called++; } - void foo() && { rvalue_called++; } - mutable int lvalue_called; - int rvalue_called; -}; - -void test() { - { - T t; - t.foo(); - assert(t.lvalue_called == 1); - assert(t.rvalue_called == 0); - } - { - T t; - static_cast(t).foo(); - assert(t.lvalue_called == 0); - assert(t.rvalue_called == 1); - } -} -} - -int main(int, char**) { - variadics_templates::test(); - rvalue_references_move_semantics::test(); - rvalue_references_perfect_forwarding::test(); - default_values_for_nttp::test(); - reference_qualified_functions::test(); - return 0; -} diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.ambig/assert.ctor.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.ambig/assert.ctor.pass.cpp index 73e6bf2846f0e..ee2370f5bce25 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.ambig/assert.ctor.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.ambig/assert.ctor.pass.cpp @@ -7,12 +7,14 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb // REQUIRES: has-unix-headers // REQUIRES: libcpp-hardening-mode={{extensive|debug}} // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.nonexist/assert.ctor.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.nonexist/assert.ctor.pass.cpp index fdd9f79958f98..5d896c34e4ccd 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.nonexist/assert.ctor.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.exception/time.zone.exception.nonexist/assert.ctor.pass.cpp @@ -7,12 +7,14 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb // REQUIRES: has-unix-headers // REQUIRES: libcpp-hardening-mode={{extensive|debug}} // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_local.pass.cpp 
b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_local.pass.cpp index d9ca1c80751cc..eb0ae4cf4b187 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_local.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_local.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb // REQUIRES: has-unix-headers // REQUIRES: libcpp-hardening-mode={{extensive|debug}} diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys.pass.cpp index 3a2ff00088676..57b7f8d0f30a0 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys.pass.cpp @@ -7,12 +7,14 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb // REQUIRES: has-unix-headers // REQUIRES: libcpp-hardening-mode={{extensive|debug}} // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys_choose.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys_choose.pass.cpp index 65429345ae794..85ce6019fcd55 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys_choose.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/assert.to_sys_choose.pass.cpp @@ -7,12 +7,14 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb // REQUIRES: has-unix-headers // REQUIRES: libcpp-hardening-mode={{extensive|debug}} // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/vendor/apple/disable-availability.sh.cpp b/libcxx/test/libcxx/vendor/apple/disable-availability.sh.cpp index 474b3f83c6044..c131f6414ed3d 100644 --- a/libcxx/test/libcxx/vendor/apple/disable-availability.sh.cpp +++ b/libcxx/test/libcxx/vendor/apple/disable-availability.sh.cpp @@ -8,10 +8,6 @@ // REQUIRES: stdlib=apple-libc++ -// This test is dependent on the code generated by the compiler, and it doesn't -// work properly with older AppleClangs. -// UNSUPPORTED: apple-clang-15 - // This test ensures that we retain a way to disable availability markup on Apple platforms // in order to work around Clang bug https://github.com/llvm/llvm-project/issues/134151. 
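// The time-zone hunks above tighten the lit gating on the hardening tests: an
// UNSUPPORTED line skips the test whenever one of the listed feature strings is
// present in the configuration, while XFAIL runs the test but expects it to fail.
// A sketch of how such a guarded libc++ test is laid out, reusing the feature
// names from the hunks above (the directives are comments that the lit test
// runner parses, not C++):

// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
// XFAIL: availability-tzdb-missing

#include <cassert>

int main(int, char**) {
  // The body only runs on configurations where no UNSUPPORTED feature matches;
  // on XFAIL configurations, lit inverts the pass/fail result.
  assert(true);
  return 0;
}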
//
diff --git a/libcxx/test/std/experimental/simd/simd.class/simd_unary.pass.cpp b/libcxx/test/std/experimental/simd/simd.class/simd_unary.pass.cpp
index eb88d90ca18bd..a496938219b05 100644
--- a/libcxx/test/std/experimental/simd/simd.class/simd_unary.pass.cpp
+++ b/libcxx/test/std/experimental/simd/simd.class/simd_unary.pass.cpp
@@ -12,9 +12,6 @@
 // Pass-by-value arguments with alignment greater than register width are not supported.
 // XFAIL: target=powerpc{{.*}}-ibm-{{.*}} && clang-18
 
-// This test crashes AppleClang 15 but not later versions.
-// UNSUPPORTED: apple-clang-15
-
 // FIXME: The following issue occurs on Windows to Armv7 Ubuntu Linux:
 //        Assertion failed: N->getValueType(0) == MVT::v1i1 && "Expected v1i1 type"
 // XFAIL: target=armv7-unknown-linux-gnueabihf
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.writefail.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.writefail.pass.cpp
new file mode 100644
index 0000000000000..27e06982d749b
--- /dev/null
+++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.writefail.pass.cpp
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-filesystem
+
+// setrlimit(RLIMIT_FSIZE) seems to only work as intended on Apple platforms
+// REQUIRES: target={{.+}}-apple-{{.+}}
+
+// <fstream>
+
+// Make sure that we properly handle the case where we try to write content to a file
+// but we fail to do so because std::fwrite fails.
+
+#include <cassert>
+#include <csignal>
+#include <cstddef>
+#include <fstream>
+#include <string>
+
+#include "platform_support.h"
+#include "test_macros.h"
+
+#if __has_include(<sys/resource.h>)
+#  include <sys/resource.h>
+void limit_file_size_to(std::size_t bytes) {
+  rlimit lim = {bytes, bytes};
+  assert(setrlimit(RLIMIT_FSIZE, &lim) == 0);
+
+  std::signal(SIGXFSZ, [](int) {}); // ignore SIGXFSZ to ensure std::fwrite fails
+}
+#else
+#  error No known way to limit the amount of filesystem space available
+#endif
+
+template <class CharT>
+void test() {
+  std::string temp = get_temp_file_name();
+  std::basic_filebuf<CharT> fbuf;
+  assert(fbuf.open(temp, std::ios::out | std::ios::trunc));
+
+  std::size_t const limit = 100000;
+  limit_file_size_to(limit);
+
+  std::basic_string<CharT> large_block(limit / 10, CharT(42));
+
+  std::streamsize ret;
+  std::size_t bytes_written = 0;
+  while ((ret = fbuf.sputn(large_block.data(), large_block.size())) != 0) {
+    bytes_written += ret;
+
+    // In theory, it's possible for an implementation to allow writing arbitrarily more bytes than
+    // set by setrlimit, but in practice if we bust 100x our limit, something else is wrong with the
+    // test and we'd end up looping forever.
+    assert(bytes_written < 100 * limit);
+  }
+
+  fbuf.close();
+}
+
+int main(int, char**) {
+  test<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
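// The new overflow.writefail test above leans on a POSIX trick worth spelling out:
// setrlimit(RLIMIT_FSIZE) caps how large a file the process may create, and a write
// past the cap raises SIGXFSZ, which kills the process by default. Ignoring the
// signal makes the oversized write fail with an error instead, which is the behavior
// the test wants to observe. A standalone sketch under the assumption of a POSIX
// system; the /tmp path is illustrative only:

#include <cassert>
#include <csignal>
#include <cstdio>
#include <sys/resource.h>

int main() {
  rlimit lim = {4096, 4096};                // allow files of at most 4 KiB
  assert(setrlimit(RLIMIT_FSIZE, &lim) == 0);
  std::signal(SIGXFSZ, SIG_IGN);            // don't die: make the write fail instead

  std::FILE* f = std::fopen("/tmp/rlimit_demo", "w");
  assert(f != nullptr);
  std::setvbuf(f, nullptr, _IONBF, 0);      // unbuffered, so fwrite hits the kernel directly

  char block[8192] = {};                    // larger than the 4 KiB limit
  std::size_t written = std::fwrite(block, 1, sizeof(block), f);
  assert(written < sizeof(block));          // the write was cut short at the limit
  std::fclose(f);
  std::remove("/tmp/rlimit_demo");
  return 0;
}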
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.pass.cpp
index 61fd0a804ecd3..f15f1b96b4b27 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.pass.cpp
@@ -62,14 +62,14 @@ int main(int, char**) {
   testbuf sb1;
   std::ostream os1(&sb1);
-  int n1;
+  int n1 = 0;
   os1 << &n1;
   assert(os1.good());
   std::string s1(sb1.str());
 
   testbuf sb2;
   std::ostream os2(&sb2);
-  int n2;
+  int n2 = 0;
   os2 << &n2;
   assert(os2.good());
   std::string s2(sb2.str());
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp
index 69d84f640d54e..6a1cde15a69bd 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.inserters.arithmetic/pointer.volatile.pass.cpp
@@ -61,7 +61,7 @@ class testbuf : public std::basic_streambuf<CharT> {
 int main(int, char**) {
   testbuf sb1;
   std::ostream os1(&sb1);
-  int n1;
+  int n1 = 0;
   os1 << &n1;
   assert(os1.good());
   std::string s1 = sb1.str();
@@ -74,7 +74,7 @@ int main(int, char**) {
 
   testbuf sb3;
   std::ostream os3(&sb3);
-  volatile int n3;
+  volatile int n3 = 0;
   os3 << &n3;
   assert(os3.good());
   std::string s3 = sb3.str();
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp
index 175cda39abc24..4cd8fadab2793 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp
@@ -12,7 +12,6 @@
 // These compiler versions and platforms don't enable sized deallocation by default.
// ADDITIONAL_COMPILE_FLAGS(clang-18): -fsized-deallocation -// ADDITIONAL_COMPILE_FLAGS(apple-clang-15): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.dynamic/ptr.launder/launder.types.verify.cpp b/libcxx/test/std/language.support/support.dynamic/ptr.launder/launder.types.verify.cpp index 4f2b627b8601a..5f6e53f98ad34 100644 --- a/libcxx/test/std/language.support/support.dynamic/ptr.launder/launder.types.verify.cpp +++ b/libcxx/test/std/language.support/support.dynamic/ptr.launder/launder.types.verify.cpp @@ -25,12 +25,11 @@ int main(int, char**) { (void)std::launder((void*)nullptr); (void)std::launder((const void*)nullptr); (void)std::launder((volatile void*)nullptr); - (void)std::launder( - (const volatile void*)nullptr); // expected-error-re@*:* 4 {{static assertion failed{{.*}}can't launder cv-void}} - // expected-error@*:* 0-4 {{void pointer argument to '__builtin_launder' is not allowed}} + (void)std::launder((const volatile void*)nullptr); + // expected-error@*:* 4 {{void pointer argument to '__builtin_launder' is not allowed}} - (void)std::launder(foo); // expected-error-re@*:* 1 {{static assertion failed{{.*}}can't launder functions}} - // expected-error@*:* 0-1 {{function pointer argument to '__builtin_launder' is not allowed}} + (void)std::launder(foo); + // expected-error@*:* 1 {{function pointer argument to '__builtin_launder' is not allowed}} return 0; } diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp index 488bc468bce79..bc479f1bcb1e0 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp @@ -459,4 +459,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/any.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/any.version.compile.pass.cpp index 7f3d6394749b4..fe0e6bd17f94d 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/any.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/any.version.compile.pass.cpp @@ -69,4 +69,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.compile.pass.cpp index 9e50976e5cc2c..30efb61893a1b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.compile.pass.cpp @@ -171,4 +171,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp index e6145bbed5af9..3470e2b28bc40 100644 --- 
a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp @@ -420,4 +420,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.compile.pass.cpp index 0d025923728b7..a908c417df48b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/bit.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/bit.version.compile.pass.cpp index 35033419ac440..cad025eee3373 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/bit.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/bit.version.compile.pass.cpp @@ -195,4 +195,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/bitset.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/bitset.version.compile.pass.cpp index ea61d99736208..8799a1f7d14e5 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/bitset.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/bitset.version.compile.pass.cpp @@ -90,4 +90,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/charconv.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/charconv.version.compile.pass.cpp index 52b02562dc5ab..6ec3037c9ea45 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/charconv.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/charconv.version.compile.pass.cpp @@ -123,4 +123,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp index 1453938b01da0..d5d7a5da4a64d 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp @@ -108,4 +108,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/cmath.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/cmath.version.compile.pass.cpp index 507c7ab6084f8..26ebe1e3ad6b1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/cmath.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/cmath.version.compile.pass.cpp @@ -204,4 +204,3 @@ #endif // TEST_STD_VER > 23 
// clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp index 56759a88a7348..907535a087de2 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/complex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/complex.version.compile.pass.cpp index b5efa984b456a..9a3a644ca5d64 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/complex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/complex.version.compile.pass.cpp @@ -105,4 +105,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.compile.pass.cpp index d9b2c43ecbd12..e4058c2348f9b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/coroutine.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/coroutine.version.compile.pass.cpp index b472b205f89d5..24a9eca1e2346 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/coroutine.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/coroutine.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/cstddef.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/cstddef.version.compile.pass.cpp index ccc034418cde0..bc65a7f3cae00 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/cstddef.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/cstddef.version.compile.pass.cpp @@ -69,4 +69,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/cstdlib.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/cstdlib.version.compile.pass.cpp index f250798c129ea..600fa2eb2e4f5 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/cstdlib.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/cstdlib.version.compile.pass.cpp @@ -75,4 +75,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/cstring.version.compile.pass.cpp 
b/libcxx/test/std/language.support/support.limits/support.limits.general/cstring.version.compile.pass.cpp index 675c918cac417..8445aa3cf0c48 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/cstring.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/cstring.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp index eff8689be9fb8..b634f3253093e 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp @@ -201,4 +201,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/exception.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/exception.version.compile.pass.cpp index 60d6418c7459a..11d5735007f5b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/exception.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/exception.version.compile.pass.cpp @@ -69,4 +69,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.compile.pass.cpp index b843aab42e6eb..77a6455e23302 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.compile.pass.cpp @@ -126,4 +126,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp index 9c7a84f145dde..74cf85ea9029f 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp @@ -129,4 +129,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp index 98acf8bb602ca..9c28db3bb0869 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp @@ -179,4 +179,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp index 19e2fd79a4295..9c06eee27e0c8 100644 --- 
a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp @@ -63,4 +63,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp index d078f9bda23c9..5985bdc2d7d4f 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp @@ -63,4 +63,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp index 3fa4334143b1a..77730f17fd9c6 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp @@ -147,4 +147,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp index 05f903dccafe7..d2082946597cb 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp @@ -297,4 +297,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp index ee32346d61080..f67adb0de1ded 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp @@ -68,4 +68,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp index 8c0820681188d..b7b7d0334830a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp @@ -579,4 +579,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp index 37deba7c9661a..4de327cbfa26b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp +++ 
b/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp @@ -104,4 +104,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ios.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ios.version.compile.pass.cpp index 179c3ce066b6f..68816936c55e9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/ios.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ios.version.compile.pass.cpp @@ -65,4 +65,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/istream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/istream.version.compile.pass.cpp index 46238896f79c3..a1178b22776f1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/istream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/istream.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.compile.pass.cpp index 75dcb18a5428c..e9805ed4b1542 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.compile.pass.cpp @@ -315,4 +315,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.compile.pass.cpp index 6857c54460650..8e105648becef 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/limits.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/limits.version.compile.pass.cpp index 0b3d6f5d2bd9c..f4cc8db0f54cb 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/limits.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/limits.version.compile.pass.cpp @@ -84,4 +84,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp index d10c61c0e9cf4..1407d74e03aa2 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp @@ -297,4 +297,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git 
a/libcxx/test/std/language.support/support.limits/support.limits.general/locale.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/locale.version.compile.pass.cpp index e1a04d1b0e087..f516881651b23 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/locale.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/locale.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp index 4044c2b1b2e0f..3db3861c72b5c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp @@ -396,4 +396,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp index e6b4adac20efb..fad0e5b9777dd 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp @@ -156,4 +156,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp index bf02dba0da773..f287e1ad9b3ad 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp @@ -678,4 +678,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/memory_resource.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/memory_resource.version.compile.pass.cpp index 52fc2d1854fec..dddf473f86a42 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/memory_resource.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/memory_resource.version.compile.pass.cpp @@ -144,4 +144,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp index fb3734fff10e5..5ffa5df8841c9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp @@ -95,4 +95,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/new.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/new.version.compile.pass.cpp index b1de3f7629e9d..3797e0966ec31 
100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/new.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/new.version.compile.pass.cpp @@ -213,4 +213,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.compile.pass.cpp index e8f109610a3e5..27170d1ea0ce7 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp index 687c343e34e08..cafbd2cac2ccf 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp @@ -252,4 +252,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp index 32685972d6019..148a6dbc0d3e4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp @@ -168,4 +168,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp index de0520af18e2a..163ea5b5514e4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp @@ -128,4 +128,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp index 263d20ace2fd9..0382d93cb40c9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp @@ -77,4 +77,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp index 0ebfc6de84104..db32433ff518e 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp @@ -120,4 +120,3 @@ #endif 
// TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/random.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/random.version.compile.pass.cpp index d40d115443977..d0ede1168dfa1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/random.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/random.version.compile.pass.cpp @@ -99,4 +99,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp index 4cf5178dd7b8f..df19f03e7dba1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp @@ -450,4 +450,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ratio.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ratio.version.compile.pass.cpp index 6507e1c683f24..b7c08fe0de42c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/ratio.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ratio.version.compile.pass.cpp @@ -60,4 +60,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/regex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/regex.version.compile.pass.cpp index d6acf35d63ab0..dc27dc91851a5 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/regex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/regex.version.compile.pass.cpp @@ -71,4 +71,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/scoped_allocator.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/scoped_allocator.version.compile.pass.cpp index 4246f2515dc09..9dc2d8b876640 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/scoped_allocator.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/scoped_allocator.version.compile.pass.cpp @@ -69,4 +69,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.compile.pass.cpp index fd0f0c51e72b2..c9cae7340e215 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp 
b/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp index 80eae6e1fd274..5dc69f29d0ecd 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp @@ -318,4 +318,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.compile.pass.cpp index 4392173ebbb3a..51feff2195c3d 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.compile.pass.cpp @@ -164,4 +164,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/source_location.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/source_location.version.compile.pass.cpp index 2b326e2b37832..9495e319521c1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/source_location.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/source_location.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp index 3c550e0fa676e..826471a65f691 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp @@ -120,4 +120,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/sstream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/sstream.version.compile.pass.cpp index b7650c436128e..992e31ed602e3 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/sstream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/sstream.version.compile.pass.cpp @@ -62,4 +62,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp index 1e530ccc3043d..61c5ed476228c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp @@ -93,4 +93,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/stdatomic.h.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/stdatomic.h.version.compile.pass.cpp index 113ffce2a5d12..c07d935106ea6 100644 --- 
a/libcxx/test/std/language.support/support.limits/support.limits.general/stdatomic.h.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/stdatomic.h.version.compile.pass.cpp @@ -65,4 +65,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp index ac70b0c21e018..6f6c4bbbde808 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp index 40a6c07081008..7236d5d7f2aca 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp @@ -486,4 +486,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp index bda523614106c..c7bafb0bf059c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp @@ -249,4 +249,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/syncstream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/syncstream.version.compile.pass.cpp index 0eaf9f1aff4fe..589b9ba5a75df 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/syncstream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/syncstream.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/thread.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/thread.version.compile.pass.cpp index e6c44a223ee89..a2a81a619d93c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/thread.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/thread.version.compile.pass.cpp @@ -128,4 +128,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp index b583edfc43ad0..b10441fee5eb9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp +++ 
b/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp @@ -333,4 +333,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp index e6c0940ab7fd5..0074f3bf4cc57 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp @@ -996,4 +996,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/typeinfo.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/typeinfo.version.compile.pass.cpp index 0729b0b37ee6a..cf29080ea75b4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/typeinfo.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/typeinfo.version.compile.pass.cpp @@ -63,4 +63,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp index 74b3c8fff69b3..221d8aaebc14b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp @@ -390,4 +390,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp index 9c400ddd2f657..d1c1335df7c80 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp @@ -312,4 +312,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp index 7dd3478576331..02e7febf5c5a1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp @@ -492,4 +492,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp index 4a7b9f7431a81..dea2f293f4c49 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp @@ -135,4 +135,3 @@ #endif // TEST_STD_VER > 23 // 
clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp index c2513ecad8d08..e34800a89c950 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp @@ -270,4 +270,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 222d562a19d63..962688e06188a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -8154,4 +8154,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/libcxx/numerics/c.math/fdelayed-template-parsing.pass.cpp b/libcxx/test/std/numerics/c.math/fdelayed-template-parsing.pass.cpp similarity index 100% rename from libcxx/test/libcxx/numerics/c.math/fdelayed-template-parsing.pass.cpp rename to libcxx/test/std/numerics/c.math/fdelayed-template-parsing.pass.cpp diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp index 46cdfd7cab1f4..2ab4c11b911b6 100644 --- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: windows // These compilers don't support constexpr `__builtin_signbit` yet. 
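For context, a minimal self-contained sketch (not part of this patch) of what the constexpr builtin gate above enables, assuming a C++23 library where std::signbit is constexpr (P0533):

#include <cmath>

// With a constexpr __builtin_signbit, sign checks become compile-time
// static_asserts instead of runtime assertions.
static_assert(!std::signbit(1.0));
static_assert(std::signbit(-1.0));
static_assert(std::signbit(-0.0)); // signbit distinguishes -0.0 from +0.0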
-// UNSUPPORTED: clang-18, clang-19, apple-clang-15, apple-clang-16, apple-clang-17 +// UNSUPPORTED: clang-18, clang-19, apple-clang-16, apple-clang-17 // GCC warns about signbit comparing `bool_v < 0`, which we're testing // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp index 43481323e800c..6bd112c7d1280 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: std-at-least-c++26 // The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-15, apple-clang-16 +// UNSUPPORTED: apple-clang-16 // diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp index ef312b7e3a911..bdfc57694dd53 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: std-at-least-c++26 // The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-15, apple-clang-16 +// UNSUPPORTED: apple-clang-16 // diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp index 94bc7ad7c48d4..1fe7916c67823 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: std-at-least-c++26 // The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-15, apple-clang-16 +// UNSUPPORTED: apple-clang-16 // diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp index 79d6e2643fc4e..b797ae7533add 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: std-at-least-c++26 // The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-15, apple-clang-16 +// UNSUPPORTED: apple-clang-16 // diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp index 92ddc30aefc66..8b6188f1fad0e 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: std-at-least-c++26 // The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-15, apple-clang-16 +// UNSUPPORTED: apple-clang-16 // diff --git a/libcxx/test/libcxx/numerics/rand/rand.req.urng/valid_int_type.verify.cpp b/libcxx/test/std/numerics/rand/rand.req/rand.req.urng/valid_int_type.verify.cpp similarity index 100% rename from libcxx/test/libcxx/numerics/rand/rand.req.urng/valid_int_type.verify.cpp rename to libcxx/test/std/numerics/rand/rand.req/rand.req.urng/valid_int_type.verify.cpp diff --git a/libcxx/test/libcxx/numerics/rand/rand.req.urng/valid_real_type.verify.cpp b/libcxx/test/std/numerics/rand/rand.req/rand.req.urng/valid_real_type.verify.cpp similarity index 100% rename from 
libcxx/test/libcxx/numerics/rand/rand.req.urng/valid_real_type.verify.cpp rename to libcxx/test/std/numerics/rand/rand.req/rand.req.urng/valid_real_type.verify.cpp diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/increment.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/increment.pass.cpp index 0ca8d92800feb..94d2bd47e9806 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/increment.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/increment.pass.cpp @@ -59,7 +59,7 @@ constexpr bool test() { { // bidi - int buffer[2] = {1, 2}; + int buffer[3] = {1, 2, 3}; std::ranges::zip_view v(BidiCommonView{buffer}); auto it = v.begin(); @@ -81,7 +81,7 @@ constexpr bool test() { { // forward - int buffer[2] = {1, 2}; + int buffer[3] = {1, 2, 3}; std::ranges::zip_view v(ForwardSizedView{buffer}); auto it = v.begin(); diff --git a/libcxx/test/std/ranges/range.utility/range.utility.conv/to_deduction.pass.cpp b/libcxx/test/std/ranges/range.utility/range.utility.conv/to_deduction.pass.cpp index 58307bd88d0fe..29572ab3c4b13 100644 --- a/libcxx/test/std/ranges/range.utility/range.utility.conv/to_deduction.pass.cpp +++ b/libcxx/test/std/ranges/range.utility/range.utility.conv/to_deduction.pass.cpp @@ -7,9 +7,6 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// There is a bug in older versions of Clang that causes trouble with constraints in classes like -// `ContainerWithDirectCtr`. -// XFAIL: apple-clang-15 // template class C, input_range R, class... Args> // constexpr auto to(R&& r, Args&&... args); // Since C++23 diff --git a/libcxx/test/libcxx/memory/shared_ptr_array.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp similarity index 94% rename from libcxx/test/libcxx/memory/shared_ptr_array.pass.cpp rename to libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp index cc8c743830826..9ff148251a05f 100644 --- a/libcxx/test/libcxx/memory/shared_ptr_array.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp @@ -12,7 +12,6 @@ // These compiler versions and platforms don't enable sized deallocation by default. // ADDITIONAL_COMPILE_FLAGS(clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(clang-18): -fsized-deallocation -// ADDITIONAL_COMPILE_FLAGS(apple-clang-15): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp index 09086a4c046d6..8e57e8913dcbe 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp @@ -7,9 +7,6 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 -// The Clang version that Android currently uses in the CI is too old. 
-// XFAIL: LIBCXX-ANDROID-FIXME - // type_traits // is_bounded_array diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp index 9aac871f2633f..bd7da40daf2bc 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp @@ -8,9 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14 -// The Clang version that Android currently uses in the CI is too old. -// XFAIL: LIBCXX-ANDROID-FIXME - // type_traits // has_unique_object_representations diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp index afd76e65060e3..192943dd820cc 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-18, clang-19, gcc-14, gcc-15, apple-clang-15, apple-clang-16, apple-clang-17 +// UNSUPPORTED: clang-18, clang-19, gcc-14, gcc-15, apple-clang-16, apple-clang-17 // diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp index 34462f9bf0ec6..e5b10f57be084 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17 +// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-16, apple-clang-17 // diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp index 5b3753c67381f..03bec8c2f81d2 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: std-at-least-c++23 // These compilers don't support std::reference_converts_from_temporary yet. 
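As background for the UNSUPPORTED churn below, a hedged standalone example of what this trait reports, assuming a C++23 library that ships std::reference_constructs_from_temporary:

#include <type_traits>

// Binding const int& to a long must materialize a temporary int -> true.
static_assert(std::reference_constructs_from_temporary_v<const int&, long>);
// Binding const int& to an int lvalue binds directly, no temporary -> false.
static_assert(!std::reference_constructs_from_temporary_v<const int&, int&>);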
-// UNSUPPORTED: android, apple-clang-15, apple-clang-16, clang-19.1 +// UNSUPPORTED: android, apple-clang-16, clang-19.1 // diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp index 849e286c8cdab..82688b10dbf45 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: std-at-least-c++23 // These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: android, apple-clang-15, apple-clang-16, clang-18, clang-19.1 +// UNSUPPORTED: android, apple-clang-16, clang-18, clang-19.1 // diff --git a/libcxx/test/std/utilities/optional/optional.comp_with_t/equal.pass.cpp b/libcxx/test/std/utilities/optional/optional.comp_with_t/equal.pass.cpp index 0636da7bcdf23..54965b270dcce 100644 --- a/libcxx/test/std/utilities/optional/optional.comp_with_t/equal.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.comp_with_t/equal.pass.cpp @@ -14,8 +14,29 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. + +static_assert(HasOperatorEqual>); +static_assert(HasOperatorEqual>); +static_assert(HasOperatorEqual>); + +static_assert(!HasOperatorEqual>); +static_assert(!HasOperatorEqual>); + +static_assert(HasOperatorEqual, int>); +static_assert(HasOperatorEqual, int>); +static_assert(HasOperatorEqual, EqualityComparable>); + +static_assert(!HasOperatorEqual, NonComparable>); +static_assert(!HasOperatorEqual, NonComparable>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.comp_with_t/greater.pass.cpp b/libcxx/test/std/utilities/optional/optional.comp_with_t/greater.pass.cpp index 2010157a8d011..b63b85b000930 100644 --- a/libcxx/test/std/utilities/optional/optional.comp_with_t/greater.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.comp_with_t/greater.pass.cpp @@ -14,8 +14,28 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. +static_assert(HasOperatorGreaterThan, int>); +static_assert(HasOperatorGreaterThan, ThreeWayComparable>); + +static_assert(!HasOperatorGreaterThan, NonComparable>); +static_assert(!HasOperatorGreaterThan, NonComparable>); +static_assert(!HasOperatorGreaterThan, ThreeWayComparable>); + +static_assert(HasOperatorGreaterThan>); +static_assert(HasOperatorGreaterThan>); + +static_assert(!HasOperatorGreaterThan>); +static_assert(!HasOperatorGreaterThan>); +static_assert(!HasOperatorGreaterThan>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.comp_with_t/greater_equal.pass.cpp b/libcxx/test/std/utilities/optional/optional.comp_with_t/greater_equal.pass.cpp index 2e1e9b9316111..bb1a8c551d3d5 100644 --- a/libcxx/test/std/utilities/optional/optional.comp_with_t/greater_equal.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.comp_with_t/greater_equal.pass.cpp @@ -14,8 +14,28 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. 
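The HasOperatorXxx helpers used throughout these hunks come from libcxx/test/support/test_comparisons.h; their exact definitions may differ, but a minimal model of the detection idiom they rely on looks like this (local stand-in types, C++20):

#include <concepts>

// The concept holds only when t == u is well-formed and boolean-testable, so
// a constrained operator== drops out of overload resolution quietly instead
// of hard-erroring inside its body.
template <class T, class U = T>
concept HasOperatorEqual = requires(const T& t, const U& u) {
  { t == u } -> std::convertible_to<bool>;
};

struct EqualityComparable {
  friend bool operator==(const EqualityComparable&, const EqualityComparable&) = default;
};
struct NonComparable {};

static_assert(HasOperatorEqual<EqualityComparable>);
static_assert(!HasOperatorEqual<NonComparable>);
static_assert(!HasOperatorEqual<EqualityComparable, NonComparable>);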
+static_assert(HasOperatorGreaterThanEqual, int>); +static_assert(HasOperatorGreaterThanEqual, ThreeWayComparable>); + +static_assert(!HasOperatorGreaterThanEqual, NonComparable>); +static_assert(!HasOperatorGreaterThanEqual, NonComparable>); +static_assert(!HasOperatorGreaterThanEqual, ThreeWayComparable>); + +static_assert(HasOperatorGreaterThanEqual>); +static_assert(HasOperatorGreaterThanEqual>); + +static_assert(!HasOperatorGreaterThanEqual>); +static_assert(!HasOperatorGreaterThanEqual>); +static_assert(!HasOperatorGreaterThanEqual>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.comp_with_t/less_equal.pass.cpp b/libcxx/test/std/utilities/optional/optional.comp_with_t/less_equal.pass.cpp index 7026c24de5dbf..8b08ec3deea5a 100644 --- a/libcxx/test/std/utilities/optional/optional.comp_with_t/less_equal.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.comp_with_t/less_equal.pass.cpp @@ -14,8 +14,28 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. +static_assert(HasOperatorLessThanEqual, int>); +static_assert(HasOperatorLessThanEqual, ThreeWayComparable>); + +static_assert(!HasOperatorLessThanEqual, NonComparable>); +static_assert(!HasOperatorLessThanEqual, NonComparable>); +static_assert(!HasOperatorLessThanEqual, ThreeWayComparable>); + +static_assert(HasOperatorLessThanEqual>); +static_assert(HasOperatorLessThanEqual>); + +static_assert(!HasOperatorLessThanEqual>); +static_assert(!HasOperatorLessThanEqual>); +static_assert(!HasOperatorLessThanEqual>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.comp_with_t/less_than.pass.cpp b/libcxx/test/std/utilities/optional/optional.comp_with_t/less_than.pass.cpp index b3d98684ba20b..3ef23a3224520 100644 --- a/libcxx/test/std/utilities/optional/optional.comp_with_t/less_than.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.comp_with_t/less_than.pass.cpp @@ -14,8 +14,28 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. +static_assert(HasOperatorLessThan, int>); +static_assert(HasOperatorLessThan, ThreeWayComparable>); + +static_assert(!HasOperatorLessThan, NonComparable>); +static_assert(!HasOperatorLessThan, NonComparable>); +static_assert(!HasOperatorLessThan, ThreeWayComparable>); + +static_assert(HasOperatorLessThan>); +static_assert(HasOperatorLessThan>); + +static_assert(!HasOperatorLessThan>); +static_assert(!HasOperatorLessThan>); +static_assert(!HasOperatorLessThan>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.comp_with_t/not_equal.pass.cpp b/libcxx/test/std/utilities/optional/optional.comp_with_t/not_equal.pass.cpp index 36bc526e763fe..97fb8ee447393 100644 --- a/libcxx/test/std/utilities/optional/optional.comp_with_t/not_equal.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.comp_with_t/not_equal.pass.cpp @@ -14,8 +14,29 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. 
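The ThreeWayComparable cases in the following asserts lean on rewritten comparison candidates; a minimal sketch of that mechanism (the test suite's helper of the same name may be defined differently):

#include <compare>

// A defaulted operator<=> makes <, <=, > and >= available as rewritten
// candidates, with no hand-written relational operators.
struct ThreeWay {
  int value;
  friend auto operator<=>(const ThreeWay&, const ThreeWay&) = default;
};

static_assert(ThreeWay{1} < ThreeWay{2});
static_assert(ThreeWay{2} >= ThreeWay{2});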
+ +static_assert(HasOperatorNotEqual>); +static_assert(HasOperatorNotEqual>); +static_assert(HasOperatorNotEqual>); + +static_assert(!HasOperatorNotEqual>); +static_assert(!HasOperatorNotEqual>); + +static_assert(HasOperatorNotEqual, int>); +static_assert(HasOperatorNotEqual, int>); +static_assert(HasOperatorNotEqual, EqualityComparable>); + +static_assert(!HasOperatorNotEqual, NonComparable>); +static_assert(!HasOperatorNotEqual, NonComparable>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.relops/equal.pass.cpp b/libcxx/test/std/utilities/optional/optional.relops/equal.pass.cpp index 96bda6a5afa26..6915b604d4ef4 100644 --- a/libcxx/test/std/utilities/optional/optional.relops/equal.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.relops/equal.pass.cpp @@ -15,8 +15,22 @@ #include #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. + +static_assert(HasOperatorEqual>); +static_assert(HasOperatorEqual>); +static_assert(HasOperatorEqual, std::optional>); + +static_assert(!HasOperatorEqual>); +static_assert(!HasOperatorEqual, std::optional>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.relops/greater_equal.pass.cpp b/libcxx/test/std/utilities/optional/optional.relops/greater_equal.pass.cpp index 399823c868b5e..d14e7df6abc80 100644 --- a/libcxx/test/std/utilities/optional/optional.relops/greater_equal.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.relops/greater_equal.pass.cpp @@ -13,8 +13,21 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. +static_assert(HasOperatorGreaterThanEqual>); +static_assert(HasOperatorGreaterThanEqual, std::optional>); + +static_assert(!HasOperatorGreaterThanEqual>); +static_assert(!HasOperatorGreaterThanEqual>); +static_assert(!HasOperatorGreaterThanEqual, std::optional>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.relops/greater_than.pass.cpp b/libcxx/test/std/utilities/optional/optional.relops/greater_than.pass.cpp index 6b877c3011cb0..f19ea23e8ae78 100644 --- a/libcxx/test/std/utilities/optional/optional.relops/greater_than.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.relops/greater_than.pass.cpp @@ -13,8 +13,21 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. +static_assert(HasOperatorGreaterThan>); +static_assert(HasOperatorGreaterThan, std::optional>); + +static_assert(!HasOperatorGreaterThan>); +static_assert(!HasOperatorGreaterThan>); +static_assert(!HasOperatorGreaterThan, std::optional>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.relops/less_equal.pass.cpp b/libcxx/test/std/utilities/optional/optional.relops/less_equal.pass.cpp index aa605cd3a1e44..bd81a79f1265e 100644 --- a/libcxx/test/std/utilities/optional/optional.relops/less_equal.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.relops/less_equal.pass.cpp @@ -13,8 +13,21 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. 
+static_assert(HasOperatorLessThanEqual>); +static_assert(HasOperatorLessThanEqual, std::optional>); + +static_assert(!HasOperatorLessThanEqual>); +static_assert(!HasOperatorLessThanEqual>); +static_assert(!HasOperatorLessThanEqual, std::optional>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.relops/less_than.pass.cpp b/libcxx/test/std/utilities/optional/optional.relops/less_than.pass.cpp index 53474ebc5f0e9..c87bfd6f73e54 100644 --- a/libcxx/test/std/utilities/optional/optional.relops/less_than.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.relops/less_than.pass.cpp @@ -13,8 +13,21 @@ #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. +static_assert(HasOperatorLessThan>); +static_assert(HasOperatorLessThan, std::optional>); + +static_assert(!HasOperatorLessThan>); +static_assert(!HasOperatorLessThan>); +static_assert(!HasOperatorLessThan, std::optional>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/optional/optional.relops/not_equal.pass.cpp b/libcxx/test/std/utilities/optional/optional.relops/not_equal.pass.cpp index 290459d9ed090..1b7d2621c9193 100644 --- a/libcxx/test/std/utilities/optional/optional.relops/not_equal.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.relops/not_equal.pass.cpp @@ -15,8 +15,22 @@ #include #include +#include "test_comparisons.h" #include "test_macros.h" +#if TEST_STD_VER >= 26 + +// Test SFINAE. + +static_assert(HasOperatorNotEqual>); +static_assert(HasOperatorNotEqual>); +static_assert(HasOperatorNotEqual, std::optional>); + +static_assert(!HasOperatorNotEqual>); +static_assert(!HasOperatorNotEqual, std::optional>); + +#endif + using std::optional; struct X { diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp index 5df6103d79e7a..3c5a57d1c7fec 100644 --- a/libcxx/test/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp +++ b/libcxx/test/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/resource.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/resource.pass.cpp index 23865f67a694b..22cb72414e975 100644 --- a/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/resource.pass.cpp +++ b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/resource.pass.cpp @@ -17,11 +17,19 @@ // memory_resource * // polymorphic_allocator::resource() const -#include #include +#include +#include +#include #include "test_macros.h" +struct resource : std::pmr::memory_resource { + void* do_allocate(size_t, size_t) override { TEST_THROW(std::bad_alloc()); } + void do_deallocate(void*, size_t, size_t) override { assert(false); } + bool do_is_equal(const std::pmr::memory_resource&) const noexcept override { return false; } +}; + int main(int, char**) { typedef std::pmr::polymorphic_allocator A; { @@ -29,24 +37,19 @@ int main(int, char**) { ASSERT_SAME_TYPE(decltype(a.resource()), std::pmr::memory_resource*); } { - std::pmr::memory_resource* mptr = (std::pmr::memory_resource*)42; - A const a(mptr); - assert(a.resource() == mptr); - } - { - A const a(nullptr); - 
assert(a.resource() == nullptr); - assert(a.resource() == nullptr); + resource res; + A const a(&res); + assert(a.resource() == &res); } { A const a; assert(a.resource() == std::pmr::get_default_resource()); } { - std::pmr::memory_resource* mptr = (std::pmr::memory_resource*)42; - std::pmr::set_default_resource(mptr); + resource res; + std::pmr::set_default_resource(&res); A const a; - assert(a.resource() == mptr); + assert(a.resource() == &res); assert(a.resource() == std::pmr::get_default_resource()); } diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/select_on_container_copy_construction.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/select_on_container_copy_construction.pass.cpp index 58d6cccd244cf..ccac1178fe516 100644 --- a/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/select_on_container_copy_construction.pass.cpp +++ b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/select_on_container_copy_construction.pass.cpp @@ -17,11 +17,19 @@ // polymorphic_allocator // polymorphic_allocator::select_on_container_copy_construction() const -#include #include +#include +#include +#include #include "test_macros.h" +struct resource : std::pmr::memory_resource { + void* do_allocate(size_t, size_t) override { TEST_THROW(std::bad_alloc()); } + void do_deallocate(void*, size_t, size_t) override { assert(false); } + bool do_is_equal(const std::pmr::memory_resource&) const noexcept override { return false; } +}; + int main(int, char**) { typedef std::pmr::polymorphic_allocator<void> A; { @@ -29,22 +37,12 @@ int main(int, char**) { ASSERT_SAME_TYPE(decltype(a.select_on_container_copy_construction()), A); } { - std::pmr::memory_resource* mptr = (std::pmr::memory_resource*)42; - A const a(mptr); - assert(a.resource() == mptr); - A const other = a.select_on_container_copy_construction(); - assert(other.resource() == std::pmr::get_default_resource()); - assert(a.resource() == mptr); - } - { - std::pmr::memory_resource* mptr = (std::pmr::memory_resource*)42; - std::pmr::set_default_resource(mptr); - A const a(nullptr); - assert(a.resource() == nullptr); + resource res; + A const a(&res); + assert(a.resource() == &res); A const other = a.select_on_container_copy_construction(); assert(other.resource() == std::pmr::get_default_resource()); - assert(other.resource() == mptr); - assert(a.resource() == nullptr); + assert(a.resource() == &res); } return 0; diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index 0a1985b02807b..63ceceaa67635 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -184,7 +184,7 @@ FROM ubuntu:jammy AS android-builder-base ARG ANDROID_CLANG_VERSION ARG ANDROID_CLANG_PREBUILTS_COMMIT -ARG ANDROID_SYSROOT_BID +ARG ANDROID_SYSROOT_COMMIT RUN apt-get update && apt-get install -y curl bzip2 git unzip @@ -217,19 +217,18 @@ RUN < ... + DenseMap<unsigned, unsigned> intAttr; + size_t size = 0; +}; +} // namespace + +static HexagonAttributesSection * +mergeAttributesSection(Ctx &ctx, + const SmallVector<InputSectionBase *, 0> &sections) { + ctx.in.hexagonAttributes = std::make_unique<HexagonAttributesSection>(ctx); + auto &merged = + static_cast<HexagonAttributesSection &>(*ctx.in.hexagonAttributes); + + // Collect all tags values from attributes section.
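For intuition, a standalone model (function names invented for illustration) of the two merge policies the loop below implements: the architecture tags take the maximum across inputs, while the feature tags are OR-ed together:

#include <algorithm>
#include <cassert>

// ARCH/HVXARCH: the highest architecture revision among the inputs wins.
unsigned mergeArch(unsigned a, unsigned b) { return std::max(a, b); }
// HVXIEEEFP/HVXQFLOAT/ZREG/AUDIO/CABAC: a feature used by any input survives.
unsigned mergeFeature(unsigned a, unsigned b) { return a | b; }

int main() {
  assert(mergeArch(68, 73) == 73);  // mixing v68 and v73 objects yields v73
  assert(mergeFeature(0, 1) == 1);  // flag stays set if any object sets it
}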
+ const auto &attributesTags = HexagonAttrs::getHexagonAttributeTags(); + for (const InputSectionBase *sec : sections) { + HexagonAttributeParser parser; + if (Error e = parser.parse(sec->content(), llvm::endianness::little)) + Warn(ctx) << sec << ": " << std::move(e); + for (const auto &tag : attributesTags) { + switch (HexagonAttrs::AttrType(tag.attr)) { + case HexagonAttrs::ARCH: + case HexagonAttrs::HVXARCH: + if (auto i = parser.getAttributeValue(tag.attr)) { + auto r = merged.intAttr.try_emplace(tag.attr, *i); + if (!r.second) + if (r.first->second < *i) + r.first->second = *i; + } + continue; + + case HexagonAttrs::HVXIEEEFP: + case HexagonAttrs::HVXQFLOAT: + case HexagonAttrs::ZREG: + case HexagonAttrs::AUDIO: + case HexagonAttrs::CABAC: + if (auto i = parser.getAttributeValue(tag.attr)) { + auto r = merged.intAttr.try_emplace(tag.attr, *i); + if (!r.second && r.first->second != *i) { + r.first->second |= *i; + } + } + continue; + } + } + } + + // The total size of headers: format-version [ <section-length> "vendor-name" + // [ <file-tag> <size>. + size_t size = 5 + merged.vendor.size() + 1 + 5; + for (auto &attr : merged.intAttr) + if (attr.second != 0) + size += getULEB128Size(attr.first) + getULEB128Size(attr.second); + merged.size = size; + return &merged; +} + +void HexagonAttributesSection::writeTo(uint8_t *buf) { + const size_t size = getSize(); + uint8_t *const end = buf + size; + *buf = ELFAttrs::Format_Version; + write32(ctx, buf + 1, size - 1); + buf += 5; + + memcpy(buf, vendor.data(), vendor.size()); + buf += vendor.size() + 1; + + *buf = ELFAttrs::File; + write32(ctx, buf + 1, end - buf); + buf += 5; + + for (auto &attr : intAttr) { + if (attr.second == 0) + continue; + buf += encodeULEB128(attr.first, buf); + buf += encodeULEB128(attr.second, buf); + } +} + +void elf::mergeHexagonAttributesSections(Ctx &ctx) { + // Find the first input SHT_HEXAGON_ATTRIBUTES; return if not found. + size_t place = + llvm::find_if(ctx.inputSections, + [](auto *s) { return s->type == SHT_HEXAGON_ATTRIBUTES; }) - + ctx.inputSections.begin(); + if (place == ctx.inputSections.size()) + return; + + // Extract all SHT_HEXAGON_ATTRIBUTES sections into `sections`. + SmallVector<InputSectionBase *, 0> sections; + llvm::erase_if(ctx.inputSections, [&](InputSectionBase *s) { + if (s->type != SHT_HEXAGON_ATTRIBUTES) + return false; + sections.push_back(s); + return true; + }); + + // Add the merged section. + ctx.inputSections.insert(ctx.inputSections.begin() + place, + mergeAttributesSection(ctx, sections)); +} + void elf::setHexagonTargetInfo(Ctx &ctx) { ctx.target.reset(new Hexagon(ctx)); } diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 4c771e47a0e72..d9639b06ca4bf 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -568,6 +568,7 @@ struct UndefinedDiag { // a partition. struct InStruct { std::unique_ptr<InputSection> attributes; + std::unique_ptr<SyntheticSection> hexagonAttributes; std::unique_ptr<SyntheticSection> riscvAttributes; std::unique_ptr<BssSection> bss; std::unique_ptr<BssSection> bssRelRo; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index a194ae71d559a..21d228eda6470 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -3446,6 +3446,10 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) { if (!ctx.arg.relocatable) combineEhSections(ctx); + // Merge .hexagon.attributes sections. + if (ctx.arg.emachine == EM_HEXAGON) + mergeHexagonAttributesSections(ctx); + // Merge .riscv.attributes sections.
if (ctx.arg.emachine == EM_RISCV) mergeRISCVAttributesSections(ctx); diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index cebd564036b2c..4333b032c9d4e 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1671,8 +1671,9 @@ void RelocationScanner::scan(Relocs rels) { } // Sort relocations by offset for more efficient searching for - // R_RISCV_PCREL_HI20, R_PPC64_ADDR64 and the branch-to-branch optimization. - if (ctx.arg.emachine == EM_RISCV || + // R_RISCV_PCREL_HI20, ALIGN relocations, R_PPC64_ADDR64 and the + // branch-to-branch optimization. + if (is_contained({EM_RISCV, EM_LOONGARCH}, ctx.arg.emachine) || (ctx.arg.emachine == EM_PPC64 && sec->name == ".toc") || ctx.arg.branchToBranch) llvm::stable_sort(sec->relocs(), diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index 6dd20b2f0cbaa..93f15920bfedb 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -245,6 +245,7 @@ template void writeARMCmseImportLib(Ctx &); uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc, RelType type); void riscvFinalizeRelax(int passes); void mergeRISCVAttributesSections(Ctx &); +void mergeHexagonAttributesSections(Ctx &); void addArmInputSectionMappingSymbols(Ctx &); void addArmSyntheticSectionMappingSymbol(Defined *); void sortArmMappingSymbols(Ctx &); diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp index 2fe96b26bfb55..04da702b48764 100644 --- a/lld/MachO/Arch/ARM64.cpp +++ b/lld/MachO/Arch/ARM64.cpp @@ -14,15 +14,10 @@ #include "lld/Common/ErrorHandler.h" #include "mach-o/compact_unwind_encoding.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/BinaryFormat/MachO.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/LEB128.h" -#include "llvm/Support/MathExtras.h" using namespace llvm; using namespace llvm::MachO; -using namespace llvm::support::endian; using namespace lld; using namespace lld::macho; @@ -39,7 +34,6 @@ struct ARM64 : ARM64Common { uint64_t &stubOffset, uint64_t selrefVA, Symbol *objcMsgSend) const override; void populateThunk(InputSection *thunk, Symbol *funcSym) override; - void applyOptimizationHints(uint8_t *, const ObjFile &) const override; void initICFSafeThunkBody(InputSection *thunk, Symbol *targetSym) const override; @@ -236,509 +230,6 @@ ARM64::ARM64() : ARM64Common(LP64()) { relocAttrs = {relocAttrsArray.data(), relocAttrsArray.size()}; } -namespace { -struct Adrp { - uint32_t destRegister; - int64_t addend; -}; - -struct Add { - uint8_t destRegister; - uint8_t srcRegister; - uint32_t addend; -}; - -enum ExtendType { ZeroExtend = 1, Sign64 = 2, Sign32 = 3 }; - -struct Ldr { - uint8_t destRegister; - uint8_t baseRegister; - uint8_t p2Size; - bool isFloat; - ExtendType extendType; - int64_t offset; -}; -} // namespace - -static bool parseAdrp(uint32_t insn, Adrp &adrp) { - if ((insn & 0x9f000000) != 0x90000000) - return false; - adrp.destRegister = insn & 0x1f; - uint64_t immHi = (insn >> 5) & 0x7ffff; - uint64_t immLo = (insn >> 29) & 0x3; - adrp.addend = SignExtend64<21>(immLo | (immHi << 2)) * 4096; - return true; -} - -static bool parseAdd(uint32_t insn, Add &add) { - if ((insn & 0xffc00000) != 0x91000000) - return false; - add.destRegister = insn & 0x1f; - add.srcRegister = (insn >> 5) & 0x1f; - add.addend = (insn >> 10) & 0xfff; - return true; -} - -static bool parseLdr(uint32_t insn, Ldr &ldr) { - ldr.destRegister = insn & 0x1f; - ldr.baseRegister = (insn >> 5) & 0x1f; - uint8_t size = insn >> 30; - uint8_t opc = (insn >> 22) & 3; - - if ((insn & 0x3fc00000) == 0x39400000) { - // LDR (immediate), LDRB 
(immediate), LDRH (immediate) - ldr.p2Size = size; - ldr.extendType = ZeroExtend; - ldr.isFloat = false; - } else if ((insn & 0x3f800000) == 0x39800000) { - // LDRSB (immediate), LDRSH (immediate), LDRSW (immediate) - ldr.p2Size = size; - ldr.extendType = static_cast<ExtendType>(opc); - ldr.isFloat = false; - } else if ((insn & 0x3f400000) == 0x3d400000) { - // LDR (immediate, SIMD&FP) - ldr.extendType = ZeroExtend; - ldr.isFloat = true; - if (opc == 1) - ldr.p2Size = size; - else if (size == 0 && opc == 3) - ldr.p2Size = 4; - else - return false; - } else { - return false; - } - ldr.offset = ((insn >> 10) & 0xfff) << ldr.p2Size; - return true; -} - -static bool isValidAdrOffset(int32_t delta) { return isInt<21>(delta); } - -static void writeAdr(void *loc, uint32_t dest, int32_t delta) { - assert(isValidAdrOffset(delta)); - uint32_t opcode = 0x10000000; - uint32_t immHi = (delta & 0x001ffffc) << 3; - uint32_t immLo = (delta & 0x00000003) << 29; - write32le(loc, opcode | immHi | immLo | dest); -} - -static void writeNop(void *loc) { write32le(loc, 0xd503201f); } - -static bool isLiteralLdrEligible(const Ldr &ldr) { - return ldr.p2Size > 1 && isShiftedInt<19, 2>(ldr.offset); -} - -static void writeLiteralLdr(void *loc, const Ldr &ldr) { - assert(isLiteralLdrEligible(ldr)); - uint32_t imm19 = (ldr.offset / 4 & maskTrailingOnes<uint32_t>(19)) << 5; - uint32_t opcode; - switch (ldr.p2Size) { - case 2: - if (ldr.isFloat) - opcode = 0x1c000000; - else - opcode = ldr.extendType == Sign64 ? 0x98000000 : 0x18000000; - break; - case 3: - opcode = ldr.isFloat ? 0x5c000000 : 0x58000000; - break; - case 4: - opcode = 0x9c000000; - break; - default: - llvm_unreachable("Invalid literal ldr size"); - } - write32le(loc, opcode | imm19 | ldr.destRegister); -} - -static bool isImmediateLdrEligible(const Ldr &ldr) { - // Note: We deviate from ld64's behavior, which converts to immediate loads - // only if ldr.offset < 4096, even though the offset is divided by the load's - // size in the 12-bit immediate operand. Only the unsigned offset variant is - // supported. - - uint32_t size = 1 << ldr.p2Size; - return ldr.offset >= 0 && (ldr.offset % size) == 0 && - isUInt<12>(ldr.offset >> ldr.p2Size); -} - -static void writeImmediateLdr(void *loc, const Ldr &ldr) { - assert(isImmediateLdrEligible(ldr)); - uint32_t opcode = 0x39000000; - if (ldr.isFloat) { - opcode |= 0x04000000; - assert(ldr.extendType == ZeroExtend); - } - opcode |= ldr.destRegister; - opcode |= ldr.baseRegister << 5; - uint8_t size, opc; - if (ldr.p2Size == 4) { - size = 0; - opc = 3; - } else { - opc = ldr.extendType; - size = ldr.p2Size; - } - uint32_t immBits = ldr.offset >> ldr.p2Size; - write32le(loc, opcode | (immBits << 10) | (opc << 22) | (size << 30)); -} - -// Transforms a pair of adrp+add instructions into an adr instruction if the -// target is within the +/- 1 MiB range allowed by the adr's 21 bit signed -// immediate offset.
-// -// adrp xN, _foo@PAGE -// add xM, xN, _foo@PAGEOFF -// -> -// adr xM, _foo -// nop -static bool applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2) { - uint32_t ins1 = read32le(buf + offset1); - uint32_t ins2 = read32le(buf + offset2); - Adrp adrp; - Add add; - if (!parseAdrp(ins1, adrp) || !parseAdd(ins2, add)) - return false; - if (adrp.destRegister != add.srcRegister) - return false; - - uint64_t addr1 = isec->getVA() + offset1; - uint64_t referent = pageBits(addr1) + adrp.addend + add.addend; - int64_t delta = referent - addr1; - if (!isValidAdrOffset(delta)) - return false; - - writeAdr(buf + offset1, add.destRegister, delta); - writeNop(buf + offset2); - return true; -} - -// Transforms two adrp instructions into a single adrp if their referent -// addresses are located on the same 4096 byte page. -// -// adrp xN, _foo@PAGE -// adrp xN, _bar@PAGE -// -> -// adrp xN, _foo@PAGE -// nop -static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2) { - uint32_t ins1 = read32le(buf + offset1); - uint32_t ins2 = read32le(buf + offset2); - Adrp adrp1, adrp2; - if (!parseAdrp(ins1, adrp1) || !parseAdrp(ins2, adrp2)) - return; - if (adrp1.destRegister != adrp2.destRegister) - return; - - uint64_t page1 = pageBits(offset1 + isec->getVA()) + adrp1.addend; - uint64_t page2 = pageBits(offset2 + isec->getVA()) + adrp2.addend; - if (page1 != page2) - return; - - writeNop(buf + offset2); -} - -// Transforms a pair of adrp+ldr (immediate) instructions into an ldr (literal) -// load from a PC-relative address if it is 4-byte aligned and within +/- 1 MiB, -// as ldr can encode a signed 19-bit offset that gets multiplied by 4. -// -// adrp xN, _foo@PAGE -// ldr xM, [xN, _foo@PAGEOFF] -// -> -// nop -// ldr xM, _foo -static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2) { - uint32_t ins1 = read32le(buf + offset1); - uint32_t ins2 = read32le(buf + offset2); - Adrp adrp; - Ldr ldr; - if (!parseAdrp(ins1, adrp) || !parseLdr(ins2, ldr)) - return; - if (adrp.destRegister != ldr.baseRegister) - return; - - uint64_t addr1 = isec->getVA() + offset1; - uint64_t addr2 = isec->getVA() + offset2; - uint64_t referent = pageBits(addr1) + adrp.addend + ldr.offset; - ldr.offset = referent - addr2; - if (!isLiteralLdrEligible(ldr)) - return; - - writeNop(buf + offset1); - writeLiteralLdr(buf + offset2, ldr); -} - -// GOT loads are emitted by the compiler as a pair of adrp and ldr instructions, -// but they may be changed to adrp+add by relaxGotLoad(). This hint performs -// the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed. -static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2) { - uint32_t ins2 = read32le(buf + offset2); - Add add; - Ldr ldr; - if (parseAdd(ins2, add)) - applyAdrpAdd(buf, isec, offset1, offset2); - else if (parseLdr(ins2, ldr)) - applyAdrpLdr(buf, isec, offset1, offset2); -} - -// Optimizes an adrp+add+ldr sequence used for loading from a local symbol's -// address by loading directly if it's close enough, or to an adrp(p)+ldr -// sequence if it's not. 
-// -// adrp x0, _foo@PAGE -// add x1, x0, _foo@PAGEOFF -// ldr x2, [x1, #off] -static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2, - uint64_t offset3) { - uint32_t ins1 = read32le(buf + offset1); - uint32_t ins2 = read32le(buf + offset2); - uint32_t ins3 = read32le(buf + offset3); - Adrp adrp; - Add add; - Ldr ldr; - if (!parseAdrp(ins1, adrp) || !parseAdd(ins2, add) || !parseLdr(ins3, ldr)) - return; - if (adrp.destRegister != add.srcRegister) - return; - if (add.destRegister != ldr.baseRegister) - return; - - // Load from the target address directly. - // nop - // nop - // ldr x2, [_foo + #off] - uint64_t addr1 = isec->getVA() + offset1; - uint64_t addr3 = isec->getVA() + offset3; - uint64_t referent = pageBits(addr1) + adrp.addend + add.addend; - Ldr literalLdr = ldr; - literalLdr.offset += referent - addr3; - if (isLiteralLdrEligible(literalLdr)) { - writeNop(buf + offset1); - writeNop(buf + offset2); - writeLiteralLdr(buf + offset3, literalLdr); - return; - } - - if (applyAdrpAdd(buf, isec, offset1, offset2)) - return; - - // Move the target's page offset into the ldr's immediate offset. - // adrp x0, _foo@PAGE - // nop - // ldr x2, [x0, _foo@PAGEOFF + #off] - Ldr immediateLdr = ldr; - immediateLdr.baseRegister = adrp.destRegister; - immediateLdr.offset += add.addend; - if (isImmediateLdrEligible(immediateLdr)) { - writeNop(buf + offset2); - writeImmediateLdr(buf + offset3, immediateLdr); - return; - } -} - -// Relaxes a GOT-indirect load. -// If the referenced symbol is external and its GOT entry is within +/- 1 MiB, -// the GOT entry can be loaded with a single literal ldr instruction. -// If the referenced symbol is local and thus has been relaxed to adrp+add+ldr, -// we perform the AdrpAddLdr transformation. -static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2, - uint64_t offset3) { - uint32_t ins2 = read32le(buf + offset2); - Add add; - Ldr ldr2; - - if (parseAdd(ins2, add)) { - applyAdrpAddLdr(buf, isec, offset1, offset2, offset3); - } else if (parseLdr(ins2, ldr2)) { - // adrp x1, _foo@GOTPAGE - // ldr x2, [x1, _foo@GOTPAGEOFF] - // ldr x3, [x2, #off] - uint32_t ins3 = read32le(buf + offset3); - Ldr ldr3; - if (!parseLdr(ins3, ldr3)) - return; - if (ldr3.baseRegister != ldr2.destRegister) - return; - // Loads from the GOT must be pointer sized. - if (ldr2.p2Size != 3 || ldr2.isFloat) - return; - applyAdrpLdr(buf, isec, offset1, offset2); - } -} - -template <typename Callback> -static void forEachHint(ArrayRef<uint8_t> data, Callback callback) { - std::array<uint64_t, 3> args; - - auto readNext = [&]() -> uint64_t { - unsigned int n = 0; - uint64_t value = decodeULEB128(data.data(), &n, data.end()); - data = data.drop_front(n); - return value; - }; - - while (!data.empty()) { - uint64_t type = readNext(); - if (type == 0) - break; - - uint64_t argCount = readNext(); - for (unsigned i = 0; i < argCount; ++i) { - uint64_t arg = readNext(); - if (i < 3) - args[i] = arg; - } - // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others. - if (argCount > 3) - continue; - callback(type, ArrayRef<uint64_t>(args.data(), argCount)); - } -} - -// On RISC architectures like arm64, materializing a memory address generally -// takes multiple instructions. If the referenced symbol is located close enough -// in memory, fewer instructions are needed. -// -// Linker optimization hints record where addresses are computed.
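A hedged, standalone sketch of the ULEB128 stream format that forEachHint above consumes (the sample values are hypothetical, though LOH_ARM64_ADRP_ADD is kind 7 in Mach-O):

#include <cassert>
#include <cstdint>
#include <vector>

// Decode one unsigned LEB128 value: 7 payload bits per byte, MSB = "more".
uint64_t decodeUleb(const uint8_t*& p) {
  uint64_t value = 0;
  unsigned shift = 0;
  uint8_t byte;
  do {
    byte = *p++;
    value |= uint64_t(byte & 0x7f) << shift;
    shift += 7;
  } while (byte & 0x80);
  return value;
}

int main() {
  // One hint entry: kind, argument count, then that many addresses.
  // 300 encodes as 0xac 0x02 (0x2c | 0x80, then 300 >> 7 == 2).
  std::vector<uint8_t> stream = {7, 2, 0xac, 0x02, 0x10};
  const uint8_t* p = stream.data();
  assert(decodeUleb(p) == 7);    // transformation kind (ADRP_ADD)
  assert(decodeUleb(p) == 2);    // number of addresses that follow
  assert(decodeUleb(p) == 300);  // first instruction address
  assert(decodeUleb(p) == 0x10); // second instruction address
}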
After -// addresses have been assigned, if possible, we change them to a shorter -// sequence of instructions. The size of the binary is not modified; the -// eliminated instructions are replaced with NOPs. This still leads to faster -// code as the CPU can skip over NOPs quickly. -// -// LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which -// points to a sequence of ULEB128-encoded numbers. Each entry specifies a -// transformation kind, and 2 or 3 addresses where the instructions are located. -void ARM64::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) const { - ArrayRef<uint8_t> data = obj.getOptimizationHints(); - if (data.empty()) - return; - - const ConcatInputSection *section = nullptr; - uint64_t sectionAddr = 0; - uint8_t *buf = nullptr; - - auto findSection = [&](uint64_t addr) { - if (section && addr >= sectionAddr && - addr < sectionAddr + section->getSize()) - return true; - - if (obj.sections.empty()) - return false; - auto secIt = std::prev(llvm::upper_bound( - obj.sections, addr, - [](uint64_t off, const Section *sec) { return off < sec->addr; })); - const Section *sec = *secIt; - - if (sec->subsections.empty()) - return false; - auto subsecIt = std::prev(llvm::upper_bound( - sec->subsections, addr - sec->addr, - [](uint64_t off, Subsection subsec) { return off < subsec.offset; })); - const Subsection &subsec = *subsecIt; - const ConcatInputSection *isec = - dyn_cast_or_null<ConcatInputSection>(subsec.isec); - if (!isec || isec->shouldOmitFromOutput()) - return false; - - section = isec; - sectionAddr = subsec.offset + sec->addr; - buf = outBuf + section->outSecOff + section->parent->fileOff; - return true; - }; - - auto isValidOffset = [&](uint64_t offset) { - if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) { - error(toString(&obj) + - ": linker optimization hint spans multiple sections"); - return false; - } - return true; - }; - - bool hasAdrpAdrp = false; - forEachHint(data, [&](uint64_t kind, ArrayRef<uint64_t> args) { - if (kind == LOH_ARM64_ADRP_ADRP) { - hasAdrpAdrp = true; - return; - } - - if (!findSection(args[0])) - return; - switch (kind) { - case LOH_ARM64_ADRP_ADD: - if (isValidOffset(args[1])) - applyAdrpAdd(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr); - break; - case LOH_ARM64_ADRP_LDR: - if (isValidOffset(args[1])) - applyAdrpLdr(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr); - break; - case LOH_ARM64_ADRP_LDR_GOT: - if (isValidOffset(args[1])) - applyAdrpLdrGot(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr); - break; - case LOH_ARM64_ADRP_ADD_LDR: - if (isValidOffset(args[1]) && isValidOffset(args[2])) - applyAdrpAddLdr(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr, args[2] - sectionAddr); - break; - case LOH_ARM64_ADRP_LDR_GOT_LDR: - if (isValidOffset(args[1]) && isValidOffset(args[2])) - applyAdrpLdrGotLdr(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr, args[2] - sectionAddr); - break; - case LOH_ARM64_ADRP_ADD_STR: - case LOH_ARM64_ADRP_LDR_GOT_STR: - // TODO: Implement these - break; - } - }); - - if (!hasAdrpAdrp) - return; - - // AdrpAdrp optimization hints are performed in a second pass because they - // might interfere with other transformations.
For instance, consider the - // following input: - // - // adrp x0, _foo@PAGE - // add x1, x0, _foo@PAGEOFF - // adrp x0, _bar@PAGE - // add x2, x0, _bar@PAGEOFF - // - // If we perform the AdrpAdrp relaxation first, we get: - // - // adrp x0, _foo@PAGE - // add x1, x0, _foo@PAGEOFF - // nop - // add x2, x0, _bar@PAGEOFF - // - // If we then apply AdrpAdd to the first two instructions, the add will have a - // garbage value in x0: - // - // adr x1, _foo - // nop - // nop - // add x2, x0, _bar@PAGEOFF - forEachHint(data, [&](uint64_t kind, ArrayRef args) { - if (kind != LOH_ARM64_ADRP_ADRP) - return; - if (!findSection(args[0])) - return; - if (isValidOffset(args[1])) - applyAdrpAdrp(buf, section, args[0] - sectionAddr, args[1] - sectionAddr); - }); -} - TargetInfo *macho::createARM64TargetInfo() { static ARM64 t; return &t; diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt index ecf6ce609e59f..3cd94ced75cc0 100644 --- a/lld/MachO/CMakeLists.txt +++ b/lld/MachO/CMakeLists.txt @@ -18,6 +18,7 @@ add_lld_library(lldMachO ICF.cpp InputFiles.cpp InputSection.cpp + LinkerOptimizationHints.cpp LTO.cpp MapFile.cpp MarkLive.cpp diff --git a/lld/MachO/LinkerOptimizationHints.cpp b/lld/MachO/LinkerOptimizationHints.cpp new file mode 100644 index 0000000000000..bae1a576eea57 --- /dev/null +++ b/lld/MachO/LinkerOptimizationHints.cpp @@ -0,0 +1,523 @@ +//===- LinkerOptimizationHints.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "LinkerOptimizationHints.h" + +#include "Arch/ARM64Common.h" +#include "lld/Common/ErrorHandler.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/LEB128.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; +using namespace llvm::MachO; +using namespace llvm::support::endian; +using namespace lld; +using namespace lld::macho; + +namespace { +struct Adrp { + uint32_t destRegister; + int64_t addend; +}; + +struct Add { + uint8_t destRegister; + uint8_t srcRegister; + uint32_t addend; +}; + +enum ExtendType { ZeroExtend = 1, Sign64 = 2, Sign32 = 3 }; + +struct Ldr { + uint8_t destRegister; + uint8_t baseRegister; + uint8_t p2Size; + bool isFloat; + ExtendType extendType; + int64_t offset; +}; +} // namespace + +static bool parseAdrp(uint32_t insn, Adrp &adrp) { + if ((insn & 0x9f000000) != 0x90000000) + return false; + adrp.destRegister = insn & 0x1f; + uint64_t immHi = (insn >> 5) & 0x7ffff; + uint64_t immLo = (insn >> 29) & 0x3; + adrp.addend = SignExtend64<21>(immLo | (immHi << 2)) * 4096; + return true; +} + +static bool parseAdd(uint32_t insn, Add &add) { + if ((insn & 0xffc00000) != 0x91000000) + return false; + add.destRegister = insn & 0x1f; + add.srcRegister = (insn >> 5) & 0x1f; + add.addend = (insn >> 10) & 0xfff; + return true; +} + +static bool parseLdr(uint32_t insn, Ldr &ldr) { + ldr.destRegister = insn & 0x1f; + ldr.baseRegister = (insn >> 5) & 0x1f; + uint8_t size = insn >> 30; + uint8_t opc = (insn >> 22) & 3; + + if ((insn & 0x3fc00000) == 0x39400000) { + // LDR (immediate), LDRB (immediate), LDRH (immediate) + ldr.p2Size = size; + ldr.extendType = ZeroExtend; + ldr.isFloat = false; + } else if ((insn & 0x3f800000) == 0x39800000) { + // LDRSB (immediate), 
LDRSH (immediate), LDRSW (immediate)
+    ldr.p2Size = size;
+    ldr.extendType = static_cast<ExtendType>(opc);
+    ldr.isFloat = false;
+  } else if ((insn & 0x3f400000) == 0x3d400000) {
+    // LDR (immediate, SIMD&FP)
+    ldr.extendType = ZeroExtend;
+    ldr.isFloat = true;
+    if (opc == 1)
+      ldr.p2Size = size;
+    else if (size == 0 && opc == 3)
+      ldr.p2Size = 4;
+    else
+      return false;
+  } else {
+    return false;
+  }
+  ldr.offset = ((insn >> 10) & 0xfff) << ldr.p2Size;
+  return true;
+}
+
+static bool isValidAdrOffset(int32_t delta) { return isInt<21>(delta); }
+
+static void writeAdr(void *loc, uint32_t dest, int32_t delta) {
+  assert(isValidAdrOffset(delta));
+  uint32_t opcode = 0x10000000;
+  uint32_t immHi = (delta & 0x001ffffc) << 3;
+  uint32_t immLo = (delta & 0x00000003) << 29;
+  write32le(loc, opcode | immHi | immLo | dest);
+}
+
+static void writeNop(void *loc) { write32le(loc, 0xd503201f); }
+
+static bool isLiteralLdrEligible(const Ldr &ldr) {
+  return ldr.p2Size > 1 && isShiftedInt<19, 2>(ldr.offset);
+}
+
+static void writeLiteralLdr(void *loc, const Ldr &ldr) {
+  assert(isLiteralLdrEligible(ldr));
+  uint32_t imm19 = (ldr.offset / 4 & maskTrailingOnes<uint32_t>(19)) << 5;
+  uint32_t opcode;
+  switch (ldr.p2Size) {
+  case 2:
+    if (ldr.isFloat)
+      opcode = 0x1c000000;
+    else
+      opcode = ldr.extendType == Sign64 ? 0x98000000 : 0x18000000;
+    break;
+  case 3:
+    opcode = ldr.isFloat ? 0x5c000000 : 0x58000000;
+    break;
+  case 4:
+    opcode = 0x9c000000;
+    break;
+  default:
+    llvm_unreachable("Invalid literal ldr size");
+  }
+  write32le(loc, opcode | imm19 | ldr.destRegister);
+}
+
+static bool isImmediateLdrEligible(const Ldr &ldr) {
+  // Note: We deviate from ld64's behavior, which converts to immediate loads
+  // only if ldr.offset < 4096, even though the offset is divided by the load's
+  // size in the 12-bit immediate operand. Only the unsigned offset variant is
+  // supported.
+
+  uint32_t size = 1 << ldr.p2Size;
+  return ldr.offset >= 0 && (ldr.offset % size) == 0 &&
+         isUInt<12>(ldr.offset >> ldr.p2Size);
+}
+
+static void writeImmediateLdr(void *loc, const Ldr &ldr) {
+  assert(isImmediateLdrEligible(ldr));
+  uint32_t opcode = 0x39000000;
+  if (ldr.isFloat) {
+    opcode |= 0x04000000;
+    assert(ldr.extendType == ZeroExtend);
+  }
+  opcode |= ldr.destRegister;
+  opcode |= ldr.baseRegister << 5;
+  uint8_t size, opc;
+  if (ldr.p2Size == 4) {
+    size = 0;
+    opc = 3;
+  } else {
+    opc = ldr.extendType;
+    size = ldr.p2Size;
+  }
+  uint32_t immBits = ldr.offset >> ldr.p2Size;
+  write32le(loc, opcode | (immBits << 10) | (opc << 22) | (size << 30));
+}
+
+// Transforms a pair of adrp+add instructions into an adr instruction if the
+// target is within the +/- 1 MiB range allowed by the adr's 21 bit signed
+// immediate offset.
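+// (Worked example with made-up addresses: an adrp+add pair at 0x100005000
+// reaching a _foo placed at 0x100042130 has delta 0x3d130, which fits in 21
+// signed bits, so the pair is rewritten as shown below.)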
+// +// adrp xN, _foo@PAGE +// add xM, xN, _foo@PAGEOFF +// -> +// adr xM, _foo +// nop +static bool applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2) { + uint32_t ins1 = read32le(buf + offset1); + uint32_t ins2 = read32le(buf + offset2); + Adrp adrp; + Add add; + if (!parseAdrp(ins1, adrp) || !parseAdd(ins2, add)) + return false; + if (adrp.destRegister != add.srcRegister) + return false; + + uint64_t addr1 = isec->getVA() + offset1; + uint64_t referent = lld::macho::pageBits(addr1) + adrp.addend + add.addend; + int64_t delta = referent - addr1; + if (!isValidAdrOffset(delta)) + return false; + + writeAdr(buf + offset1, add.destRegister, delta); + writeNop(buf + offset2); + return true; +} + +// Transforms two adrp instructions into a single adrp if their referent +// addresses are located on the same 4096 byte page. +// +// adrp xN, _foo@PAGE +// adrp xN, _bar@PAGE +// -> +// adrp xN, _foo@PAGE +// nop +static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2) { + uint32_t ins1 = read32le(buf + offset1); + uint32_t ins2 = read32le(buf + offset2); + Adrp adrp1, adrp2; + if (!parseAdrp(ins1, adrp1) || !parseAdrp(ins2, adrp2)) + return; + if (adrp1.destRegister != adrp2.destRegister) + return; + + uint64_t page1 = pageBits(offset1 + isec->getVA()) + adrp1.addend; + uint64_t page2 = pageBits(offset2 + isec->getVA()) + adrp2.addend; + if (page1 != page2) + return; + + writeNop(buf + offset2); +} + +// Transforms a pair of adrp+ldr (immediate) instructions into an ldr (literal) +// load from a PC-relative address if it is 4-byte aligned and within +/- 1 MiB, +// as ldr can encode a signed 19-bit offset that gets multiplied by 4. +// +// adrp xN, _foo@PAGE +// ldr xM, [xN, _foo@PAGEOFF] +// -> +// nop +// ldr xM, _foo +static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2) { + uint32_t ins1 = read32le(buf + offset1); + uint32_t ins2 = read32le(buf + offset2); + Adrp adrp; + Ldr ldr; + if (!parseAdrp(ins1, adrp) || !parseLdr(ins2, ldr)) + return; + if (adrp.destRegister != ldr.baseRegister) + return; + + uint64_t addr1 = isec->getVA() + offset1; + uint64_t addr2 = isec->getVA() + offset2; + uint64_t referent = pageBits(addr1) + adrp.addend + ldr.offset; + ldr.offset = referent - addr2; + if (!isLiteralLdrEligible(ldr)) + return; + + writeNop(buf + offset1); + writeLiteralLdr(buf + offset2, ldr); +} + +// GOT loads are emitted by the compiler as a pair of adrp and ldr instructions, +// but they may be changed to adrp+add by relaxGotLoad(). This hint performs +// the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed. +static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2) { + uint32_t ins2 = read32le(buf + offset2); + Add add; + Ldr ldr; + if (parseAdd(ins2, add)) + applyAdrpAdd(buf, isec, offset1, offset2); + else if (parseLdr(ins2, ldr)) + applyAdrpLdr(buf, isec, offset1, offset2); +} + +// Optimizes an adrp+add+ldr sequence used for loading from a local symbol's +// address by loading directly if it's close enough, or to an adrp(p)+ldr +// sequence if it's not. 
+//
+// adrp x0, _foo@PAGE
+// add x1, x0, _foo@PAGEOFF
+// ldr x2, [x1, #off]
+static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec,
+                            uint64_t offset1, uint64_t offset2,
+                            uint64_t offset3) {
+  uint32_t ins1 = read32le(buf + offset1);
+  uint32_t ins2 = read32le(buf + offset2);
+  uint32_t ins3 = read32le(buf + offset3);
+  Adrp adrp;
+  Add add;
+  Ldr ldr;
+  if (!parseAdrp(ins1, adrp) || !parseAdd(ins2, add) || !parseLdr(ins3, ldr))
+    return;
+  if (adrp.destRegister != add.srcRegister)
+    return;
+  if (add.destRegister != ldr.baseRegister)
+    return;
+
+  // Load from the target address directly.
+  // nop
+  // nop
+  // ldr x2, [_foo + #off]
+  uint64_t addr1 = isec->getVA() + offset1;
+  uint64_t addr3 = isec->getVA() + offset3;
+  uint64_t referent = pageBits(addr1) + adrp.addend + add.addend;
+  Ldr literalLdr = ldr;
+  literalLdr.offset += referent - addr3;
+  if (isLiteralLdrEligible(literalLdr)) {
+    writeNop(buf + offset1);
+    writeNop(buf + offset2);
+    writeLiteralLdr(buf + offset3, literalLdr);
+    return;
+  }
+
+  if (applyAdrpAdd(buf, isec, offset1, offset2))
+    return;
+
+  // Move the target's page offset into the ldr's immediate offset.
+  // adrp x0, _foo@PAGE
+  // nop
+  // ldr x2, [x0, _foo@PAGEOFF + #off]
+  Ldr immediateLdr = ldr;
+  immediateLdr.baseRegister = adrp.destRegister;
+  immediateLdr.offset += add.addend;
+  if (isImmediateLdrEligible(immediateLdr)) {
+    writeNop(buf + offset2);
+    writeImmediateLdr(buf + offset3, immediateLdr);
+    return;
+  }
+}
+
+// Relaxes a GOT-indirect load.
+// If the referenced symbol is external and its GOT entry is within +/- 1 MiB,
+// the GOT entry can be loaded with a single literal ldr instruction.
+// If the referenced symbol is local and thus has been relaxed to adrp+add+ldr,
+// we perform the AdrpAddLdr transformation.
+static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec,
+                               uint64_t offset1, uint64_t offset2,
+                               uint64_t offset3) {
+  uint32_t ins2 = read32le(buf + offset2);
+  Add add;
+  Ldr ldr2;
+
+  if (parseAdd(ins2, add)) {
+    applyAdrpAddLdr(buf, isec, offset1, offset2, offset3);
+  } else if (parseLdr(ins2, ldr2)) {
+    // adrp x1, _foo@GOTPAGE
+    // ldr x2, [x1, _foo@GOTPAGEOFF]
+    // ldr x3, [x2, #off]
+    uint32_t ins3 = read32le(buf + offset3);
+    Ldr ldr3;
+    if (!parseLdr(ins3, ldr3))
+      return;
+    if (ldr3.baseRegister != ldr2.destRegister)
+      return;
+    applyAdrpLdr(buf, isec, offset1, offset2);
+  }
+}
+
+template <typename Callback>
+static void forEachHint(ArrayRef<uint8_t> data, Callback callback) {
+  std::array<uint64_t, 3> args;
+
+  auto readNext = [&]() -> uint64_t {
+    unsigned int n = 0;
+    uint64_t value = decodeULEB128(data.data(), &n, data.end());
+    data = data.drop_front(n);
+    return value;
+  };
+
+  while (!data.empty()) {
+    uint64_t type = readNext();
+    if (type == 0)
+      break;
+
+    uint64_t argCount = readNext();
+    for (unsigned i = 0; i < argCount; ++i) {
+      uint64_t arg = readNext();
+      if (i < 3)
+        args[i] = arg;
+    }
+    // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others.
+    if (argCount > 3)
+      continue;
+    callback(type, ArrayRef<uint64_t>(args.data(), argCount));
+  }
+}
+
+// On RISC architectures like arm64, materializing a memory address generally
+// takes multiple instructions. If the referenced symbol is located close enough
+// in memory, fewer instructions are needed.
+//
+// Linker optimization hints record where addresses are computed. After
+// addresses have been assigned, if possible, we change them to a shorter
+// sequence of instructions.
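+// (For instance, an LOH_ARM64_ADRP_ADD entry decoded by forEachHint above
+// names the addresses of an adrp+add pair; applyAdrpAdd may then collapse
+// that pair into a single adr followed by a nop.)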
The size of the binary is not modified; the +// eliminated instructions are replaced with NOPs. This still leads to faster +// code as the CPU can skip over NOPs quickly. +// +// LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which +// points to a sequence of ULEB128-encoded numbers. Each entry specifies a +// transformation kind, and 2 or 3 addresses where the instructions are located. +void macho::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) { + ArrayRef data = obj.getOptimizationHints(); + if (data.empty()) + return; + + const ConcatInputSection *section = nullptr; + uint64_t sectionAddr = 0; + uint8_t *buf = nullptr; + + auto findSection = [&](uint64_t addr) { + if (section && addr >= sectionAddr && + addr < sectionAddr + section->getSize()) + return true; + + if (obj.sections.empty()) + return false; + auto secIt = std::prev(llvm::upper_bound( + obj.sections, addr, + [](uint64_t off, const Section *sec) { return off < sec->addr; })); + const Section *sec = *secIt; + + if (sec->subsections.empty()) + return false; + auto subsecIt = std::prev(llvm::upper_bound( + sec->subsections, addr - sec->addr, + [](uint64_t off, Subsection subsec) { return off < subsec.offset; })); + const Subsection &subsec = *subsecIt; + const ConcatInputSection *isec = + dyn_cast_or_null(subsec.isec); + if (!isec || isec->shouldOmitFromOutput()) + return false; + + section = isec; + sectionAddr = subsec.offset + sec->addr; + buf = outBuf + section->outSecOff + section->parent->fileOff; + return true; + }; + + auto isValidOffset = [&](uint64_t offset) { + if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) { + error(toString(&obj) + + ": linker optimization hint spans multiple sections"); + return false; + } + return true; + }; + + bool hasAdrpAdrp = false; + forEachHint(data, [&](uint64_t kind, ArrayRef args) { + if (kind == LOH_ARM64_ADRP_ADRP) { + hasAdrpAdrp = true; + return; + } + + if (!findSection(args[0])) + return; + switch (kind) { + case LOH_ARM64_ADRP_ADD: + if (isValidOffset(args[1])) + applyAdrpAdd(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr); + break; + case LOH_ARM64_ADRP_LDR: + if (isValidOffset(args[1])) + applyAdrpLdr(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr); + break; + case LOH_ARM64_ADRP_LDR_GOT: + if (isValidOffset(args[1])) + applyAdrpLdrGot(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr); + break; + case LOH_ARM64_ADRP_ADD_LDR: + if (isValidOffset(args[1]) && isValidOffset(args[2])) + applyAdrpAddLdr(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr, args[2] - sectionAddr); + break; + case LOH_ARM64_ADRP_LDR_GOT_LDR: + if (isValidOffset(args[1]) && isValidOffset(args[2])) + applyAdrpLdrGotLdr(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr, args[2] - sectionAddr); + break; + case LOH_ARM64_ADRP_ADD_STR: + case LOH_ARM64_ADRP_LDR_GOT_STR: + // TODO: Implement these + break; + } + }); + + if (!hasAdrpAdrp) + return; + + // AdrpAdrp optimization hints are performed in a second pass because they + // might interfere with other transformations. 
For instance, consider the + // following input: + // + // adrp x0, _foo@PAGE + // add x1, x0, _foo@PAGEOFF + // adrp x0, _bar@PAGE + // add x2, x0, _bar@PAGEOFF + // + // If we perform the AdrpAdrp relaxation first, we get: + // + // adrp x0, _foo@PAGE + // add x1, x0, _foo@PAGEOFF + // nop + // add x2, x0, _bar@PAGEOFF + // + // If we then apply AdrpAdd to the first two instructions, the add will have a + // garbage value in x0: + // + // adr x1, _foo + // nop + // nop + // add x2, x0, _bar@PAGEOFF + forEachHint(data, [&](uint64_t kind, ArrayRef args) { + if (kind != LOH_ARM64_ADRP_ADRP) + return; + if (!findSection(args[0])) + return; + if (isValidOffset(args[1])) + applyAdrpAdrp(buf, section, args[0] - sectionAddr, args[1] - sectionAddr); + }); +} diff --git a/lld/MachO/LinkerOptimizationHints.h b/lld/MachO/LinkerOptimizationHints.h new file mode 100644 index 0000000000000..eada9b048c255 --- /dev/null +++ b/lld/MachO/LinkerOptimizationHints.h @@ -0,0 +1,17 @@ +//===- LinkerOptimizationHints.h ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_MACHO_LINKER_OPTIMIZATION_HINTS_H +#define LLD_MACHO_LINKER_OPTIMIZATION_HINTS_H + +#include "InputFiles.h" + +namespace lld::macho { +void applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj); +} +#endif diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h index 39f5f94078611..27e5178593c87 100644 --- a/lld/MachO/Target.h +++ b/lld/MachO/Target.h @@ -124,8 +124,6 @@ class TargetInfo { llvm_unreachable("Unsupported architecture for dtrace symbols"); } - virtual void applyOptimizationHints(uint8_t *, const ObjFile &) const {}; - uint32_t magic; llvm::MachO::CPUType cpuType; uint32_t cpuSubtype; diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index 613e6dea3b897..f288fadc0d14f 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -11,6 +11,7 @@ #include "Config.h" #include "InputFiles.h" #include "InputSection.h" +#include "LinkerOptimizationHints.h" #include "MapFile.h" #include "OutputSection.h" #include "OutputSegment.h" @@ -1209,14 +1210,15 @@ void Writer::writeSections() { } void Writer::applyOptimizationHints() { - if (config->arch() != AK_arm64 || config->ignoreOptimizationHints) + if (!is_contained({AK_arm64, AK_arm64e, AK_arm64_32}, config->arch()) || + config->ignoreOptimizationHints) return; uint8_t *buf = buffer->getBufferStart(); TimeTraceScope timeScope("Apply linker optimization hints"); parallelForEach(inputFiles, [buf](const InputFile *file) { if (const auto *objFile = dyn_cast(file)) - target->applyOptimizationHints(buf, *objFile); + macho::applyOptimizationHints(buf, *objFile); }); } diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 69ed57d67615c..863b201891006 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -70,6 +70,10 @@ ELF Improvements not provided in the command line by the user and cannot be inferred from inputs. * For LoongArch, the initial-exec to local-exec TLS optimization has been implemented. 
+* For LoongArch, several relaxation optimizations are supported, including relaxation for + ``R_LARCH_PCALA_HI20/LO12`` and ``R_LARCH_GOT_PC_HI20/LO12`` relocations, instruction + relaxation for ``R_LARCH_CALL36``, TLS local-exec (``LE``)/global dynamic (``GD``)/ + local dynamic (``LD``) model relaxation, and TLSDESC code sequence relaxation. * For RISCV, an oscillation bug due to call relaxation is now fixed. (`#142899 `_) * For x86-64, the ``.ltext`` section is now placed before ``.rodata``. diff --git a/lld/test/ELF/hexagon-attributes.s b/lld/test/ELF/hexagon-attributes.s new file mode 100644 index 0000000000000..eee820361327f --- /dev/null +++ b/lld/test/ELF/hexagon-attributes.s @@ -0,0 +1,150 @@ +# REQUIRES: hexagon + +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf a.s -o a.o +# RUN: ld.lld -e 0 a.o -o out 2>&1 | count 0 +# RUN: llvm-readelf -S -l --arch-specific out | FileCheck %s --check-prefixes=HDR,CHECK +# RUN: ld.lld -e 0 a.o a.o -o out1 2>&1 | count 0 +# RUN: llvm-readobj --arch-specific out1 | FileCheck %s +# RUN: ld.lld -r a.o a.o -o out1 2>&1 | count 0 +# RUN: llvm-readobj --arch-specific out1 | FileCheck %s + +# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf b.s -o b.o +# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf c.s -o c.o +# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf d.s -o d.o +# RUN: ld.lld a.o b.o c.o -o out2 +# RUN: llvm-readobj --arch-specific out2 | FileCheck %s --check-prefix=CHECK2 +# RUN: ld.lld a.o b.o c.o d.o -o out3 +# RUN: llvm-readobj --arch-specific out3 | FileCheck %s --check-prefix=CHECK3 + +# HDR: Name Type Address Off Size ES Flg Lk Inf Al +# HDR: .hexagon.attributes HEXAGON_ATTRIBUTES 00000000 {{.*}} {{.*}} 00 0 0 1{{$}} + +# HDR: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# HDR: LOAD {{.*}} +# HDR-NEXT: GNU_STACK {{.*}} + +# CHECK: BuildAttributes { +# CHECK-NEXT: FormatVersion: 0x41 +# CHECK-NEXT: Section 1 { +# CHECK-NEXT: SectionLength: 19 +# CHECK-NEXT: Vendor: hexagon +# CHECK-NEXT: Tag: Tag_File (0x1) +# CHECK-NEXT: Size: 7 +# CHECK-NEXT: FileAttributes { +# CHECK-NEXT: Attribute { +# CHECK-NEXT: Tag: 4 +# CHECK-NEXT: TagName: arch +# CHECK-NEXT: Value: 68{{$}} +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } + +# CHECK2: BuildAttributes { +# CHECK2-NEXT: FormatVersion: 0x41 +# CHECK2-NEXT: Section 1 { +# CHECK2-NEXT: SectionLength: 21 +# CHECK2-NEXT: Vendor: hexagon +# CHECK2-NEXT: Tag: Tag_File (0x1) +# CHECK2-NEXT: Size: 9 +# CHECK2-NEXT: FileAttributes { +# CHECK2-NEXT: Attribute { +# CHECK2-NEXT: Tag: 4 +# CHECK2-NEXT: TagName: arch +# CHECK2-NEXT: Value: 68{{$}} +# CHECK2-NEXT: } +# CHECK2-NEXT: Attribute { +# CHECK2-NEXT: Tag: 5 +# CHECK2-NEXT: TagName: hvx_arch +# CHECK2-NEXT: Value: 68{{$}} +# CHECK2-NEXT: } +# CHECK2-NEXT: } +# CHECK2-NEXT: } +# CHECK2-NEXT: } + +# CHECK3: BuildAttributes { +# CHECK3-NEXT: FormatVersion: 0x41 +# CHECK3-NEXT: Section 1 { +# CHECK3-NEXT: SectionLength: 25 +# CHECK3-NEXT: Vendor: hexagon +# CHECK3-NEXT: Tag: Tag_File (0x1) +# CHECK3-NEXT: Size: 13 +# CHECK3-NEXT: FileAttributes { +# CHECK3-NEXT: Attribute { +# CHECK3-NEXT: Tag: 7 +# CHECK3-NEXT: TagName: hvx_qfloat +# CHECK3-NEXT: Value: 68{{$}} +# CHECK3-NEXT: } +# CHECK3-NEXT: Attribute { +# CHECK3-NEXT: Tag: 9 +# CHECK3-NEXT: TagName: audio +# CHECK3-NEXT: Value: 68{{$}} +# CHECK3-NEXT: } +# CHECK3-NEXT: Attribute { +# CHECK3-NEXT: Tag: 4 +# CHECK3-NEXT: TagName: arch +# CHECK3-NEXT: Value: 68{{$}} +# CHECK3-NEXT: } +# CHECK3-NEXT: 
Attribute { +# CHECK3-NEXT: Tag: 5 +# CHECK3-NEXT: TagName: hvx_arch +# CHECK3-NEXT: Value: 68{{$}} +# CHECK3-NEXT: } +# CHECK3-NEXT: } +# CHECK3-NEXT: } +# CHECK3-NEXT: } + +#--- a.s +.section .hexagon.attributes,"",@0x70000003 +.byte 0x41 +.long .Lend-.hexagon.attributes-1 +.asciz "hexagon" +.Lbegin: +.byte 1 +.long .Lend-.Lbegin +.byte 4 +.byte 68 +.Lend: + +#--- b.s +.section .hexagon.attributes,"",@0x70000003 +.byte 0x41 +.long .Lend1-.hexagon.attributes-1 +.asciz "hexagon" +.Lbegin1: +.byte 1 +.long .Lend1-.Lbegin1 +.byte 4 +.byte 68 +.Lend1: + +#--- c.s +.section .hexagon.attributes,"",@0x70000003 +.byte 0x41 +.long .Lend2-.hexagon.attributes-1 +.asciz "hexagon" +.Lbegin2: +.byte 1 +.long .Lend2-.Lbegin2 +.byte 4 +.byte 68 +.byte 5 +.byte 68 +.Lend2: + +#--- d.s +.section .hexagon.attributes,"",@0x70000003 +.byte 0x41 +.long .Lend3-.hexagon.attributes-1 +.asciz "hexagon" +.Lbegin3: +.byte 1 +.long .Lend3-.Lbegin3 +.byte 4 +.byte 68 +.byte 7 +.byte 68 +.byte 9 +.byte 68 +.Lend3: diff --git a/lld/test/MachO/loh-arm64-32.s b/lld/test/MachO/loh-arm64-32.s new file mode 100644 index 0000000000000..906d0e1ce9046 --- /dev/null +++ b/lld/test/MachO/loh-arm64-32.s @@ -0,0 +1,64 @@ +# REQUIRES: aarch64 + +# RUN: llvm-mc -filetype=obj -triple=arm64_32-apple-watchos %s -o %t.o +# RUN: %lld-watchos -U _external %t.o -o %t +# RUN: llvm-objdump -d --macho %t | FileCheck %s + +.text +.align 2 +.globl _foo +_foo: + ret +.globl _bar +_bar: + ret + +.globl _main +_main: +# CHECK-LABEL: _main: + +L1: adrp x0, _foo@PAGE +L2: add x0, x0, _foo@PAGEOFF +# CHECK-NEXT: adr x0 +# CHECK-NEXT: nop + +L3: adrp x0, _ptr@PAGE +L4: add x1, x0, _ptr@PAGEOFF +L5: ldr x2, [x1] +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: ldr x2 + +L6: adrp x0, _foo@PAGE +L7: adrp x0, _bar@PAGE +# CHECK-NEXT: adrp x0 +# CHECK-NEXT: nop + +L8: adrp x0, _ptr@PAGE +L9: ldr x0, [x0, _ptr@PAGEOFF] +# CHECK-NEXT: nop +# CHECK-NEXT: ldr x0 + +L10: adrp x0, _ptr@PAGE +L11: ldr w0, [x0, _ptr@PAGEOFF] +# CHECK-NEXT: nop +# CHECK-NEXT: ldr w0, _ptr + +L12: adrp x0, _external@PAGE +L13: ldr w1, [x0, _external@PAGEOFF] +L14: ldr x2, [x1] +# CHECK-NEXT: nop +# CHECK-NEXT: ldr w1, 0x{{.*}} +# CHECK-NEXT: ldr x2, [x1] + +.data +.align 4 +_ptr: + .quad 0 + +.loh AdrpAdd L1, L2 +.loh AdrpAddLdr L3, L4, L5 +.loh AdrpAdrp L6, L7 +.loh AdrpLdr L8, L9 +.loh AdrpLdrGot L10, L11 +.loh AdrpLdrGotLdr L12, L13, L14 diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index 480430fede928..4bbec891da0b3 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -211,6 +211,7 @@ Clang. Then we build the ``ALL`` target with ninja: $ cmake -B /path/to/llvm-build -G Ninja \ -DLLVM_ENABLE_PROJECTS=clang \ + -DCMAKE_BUILD_TYPE=Release \ [] /path/to/llvm-project/llvm $ ninja @@ -224,6 +225,7 @@ build directory for Clang, remember to pass its module path via ``Clang_DIR`` :: $ cmake -B /path/to/lldb-build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ -DLLVM_DIR=/path/to/llvm-build/lib/cmake/llvm \ [] /path/to/llvm-project/lldb $ ninja lldb lldb-server diff --git a/lldb/docs/resources/qemu-testing.rst b/lldb/docs/resources/qemu-testing.rst index 8571287a04262..90b8fd50cb5c4 100644 --- a/lldb/docs/resources/qemu-testing.rst +++ b/lldb/docs/resources/qemu-testing.rst @@ -167,3 +167,50 @@ The result of this is that: Your VM configuration should have ports ``54321`` and ``49140`` forwarded for this to work. 
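+(One way to get those forwards is QEMU's user-mode networking; the ``hostfwd``
+values below are illustrative and all unrelated flags are elided:)
+
+::
+
+  $ qemu-system-aarch64 ... -netdev user,id=net0,hostfwd=tcp::54321-:54321,hostfwd=tcp::49140-:49140
+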
+
+QEMU user mode emulation
+------------------------
+
+Serious testing of LLDB should be done using system mode emulation. The
+following is presented for information only and is not a testing configuration
+supported by the LLDB project.
+
+However, it is possible to run the test suite against user mode QEMU if you
+just want to test a specific aspect of ``lldb`` and are OK with ignoring a lot
+of expected failures. This method can also be adapted for simulators with a
+qemu-like command line interface.
+
+(``lldb-server`` cannot be tested using user mode QEMU because that does not
+emulate the debugging system calls that ``lldb-server`` tries to make.)
+
+Change ``LLDB_TEST_USER_ARGS`` to choose the ``qemu-user`` platform and
+configure it for your architecture. The example below is for AArch64 and
+assumes that ``qemu-aarch64`` is installed and on your path.
+
+If you need to override how the ``qemu-user`` platform finds the QEMU binary,
+look up the rest of the platform's settings in LLDB.
+
+::
+
+  -DLLDB_TEST_USER_ARGS="--platform-name;qemu-user;--setting;platform.plugin.qemu-user.architecture=aarch64;--arch;aarch64"
+
+Also set ``LLDB_TEST_COMPILER`` to something that can target the emulated
+architecture. Then you should be able to run ``ninja check-lldb`` and it will
+run the tests on QEMU user automatically.
+
+You will see a number of failures compared to a normal test run. Reasons for
+this can be, but are not limited to:
+
+* QEMU's built-in debug stub acting differently and supporting different
+  features to different extents, when compared to ``lldb-server``. We try to
+  be compatible but LLDB is not regularly tested with QEMU user.
+
+* Tests that spawn new processes to attach to. QEMU user only emulates a single
+  process.
+
+* Watchpoints. Either these are not emulated or behave differently to real
+  hardware. Add ``--skip-category;watchpoint`` to ``-DLLDB_TEST_USER_ARGS`` to
+  skip those.
+
+* Lack of memory region information due to QEMU communicating this in the
+  GDB server format which LLDB does not use.
diff --git a/lldb/docs/use/mcp.md b/lldb/docs/use/mcp.md
index 375c164fe771c..b7474246b54f3 100644
--- a/lldb/docs/use/mcp.md
+++ b/lldb/docs/use/mcp.md
@@ -75,7 +75,69 @@ Configuration example for [Visual Studio Code](https://code.visualstudio.com/doc
 }
 ```
 
-### Troubleshooting
+## Tools
+
+Tools are a primitive in the Model Context Protocol that enable servers to
+expose functionality to clients.
+
+LLDB's MCP integration exposes one tool, named `lldb_command`, which allows the
+model to run the same commands a user would type in the LLDB command
+interpreter. It takes two arguments:
+
+1. The unique debugger ID as a number.
+2. The command and its arguments as a string.
+
+## Resources
+
+Resources are a primitive in the Model Context Protocol that allow servers to
+expose content that can be read by clients.
+
+LLDB's MCP integration exposes a resource for each debugger and target
+instance.
Debugger resources are accessible using the following URI:
+
+```
+lldb://debugger/<debugger-id>
+```
+
+Example output:
+
+```json
+{
+  "contents": [
+    {
+      "uri": "lldb://debugger/1",
+      "mimeType": "application/json",
+      "text": "{\"debugger_id\":1,\"name\":\"debugger_1\",\"num_targets\":1}"
+    }
+  ]
+}
+```
+
+Debuggers can contain one or more targets, which are accessible using the
+following URI:
+
+```
+lldb://debugger/<debugger-id>/target/<target-index>
+```
+
+Example output:
+
+```json
+{
+  "contents": [
+    {
+      "uri": "lldb://debugger/1/target/0",
+      "mimeType": "application/json",
+      "text": "{\"arch\":\"arm64-apple-macosx26.0.0\",\"debugger_id\":1,\"dummy\":false,\"path\":\"/bin/count\",\"platform\":\"host\",\"selected\":true,\"target_idx\":0}"
+    }
+  ]
+}
+```
+
+Note that unlike the debugger id, which is unique, the target index is not
+stable and may be reused when a target is removed and a new target is added.
+
+## Troubleshooting
 
 The MCP server uses the `Host` log channel. You can enable logging with the
 `log enable` command.
diff --git a/lldb/examples/python/filter_disasm.py b/lldb/examples/python/filter_disasm.py
new file mode 100644
index 0000000000000..46c9f794b25a2
--- /dev/null
+++ b/lldb/examples/python/filter_disasm.py
@@ -0,0 +1,98 @@
+"""
+Defines a command, fdis, that does filtered disassembly. The command does the
+lldb disassemble command with -b and any other arguments passed in, and
+pipes that through a provided filter program.
+
+The intention is to support disassembly of RISC-V proprietary instructions.
+This is handled with llvm-objdump by piping the output of llvm-objdump through
+a filter program. This script is intended to mimic that workflow.
+"""
+
+import lldb
+import subprocess
+
+
+class Program(list):
+    def __str__(self):
+        return " ".join(self)
+
+
+filter_program = Program(["crustfilt"])
+
+def __lldb_init_module(debugger, dict):
+    debugger.HandleCommand("command script add -f filter_disasm.fdis fdis")
+    print("Disassembly filter command (fdis) loaded")
+    print("Filter program set to %s" % filter_program)
+
+
+def fdis(debugger, args, exe_ctx, result, dict):
+    """
+    Call the built in disassembler, then pass its output to a filter program
+    to add in disassembly for hidden opcodes.
+    Except for get and set, use the fdis command like the disassemble command.
+    By default, the filter program is crustfilt, from
+    https://github.com/quic/crustfilt . This can be changed by changing
+    the global variable filter_program.
+
+    Usage:
+      fdis [[get] [set <program>] [<disassembly args>]]
+
+    Choose one of the following:
+      get
+          Gets the current filter program
+
+      set <program>
+          Sets the current filter program. This can be an executable, which
+          will be found on PATH, or an absolute path.
+
+      <disassembly args>
+          If the first argument is not get or set, the args will be passed
+          to the disassemble command as is.
+
+    """
+
+    global filter_program
+    args_list = args.split(" ")
+    result.Clear()
+
+    if len(args_list) == 1 and args_list[0] == "get":
+        result.PutCString(str(filter_program))
+        result.SetStatus(lldb.eReturnStatusSuccessFinishResult)
+        return
+
+    if args_list[0] == "set":
+        # Assume the rest is a program to run and any arguments to be passed to
+        # it.
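+        # (Hypothetical example: "fdis set /opt/bin/myfilter --verbose" makes
+        # the filter program ["/opt/bin/myfilter", "--verbose"].)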
+ if len(args_list) <= 1: + result.PutCString('"set" command requires a program argument') + result.SetStatus(lldb.eReturnStatusFailed) + return + + filter_program = Program(args_list[1:]) + result.PutCString('Filter program set to "{}"'.format(filter_program)) + result.SetStatus(lldb.eReturnStatusSuccessFinishResult) + return + + res = lldb.SBCommandReturnObject() + debugger.GetCommandInterpreter().HandleCommand("disassemble -b " + args, exe_ctx, res) + if len(res.GetError()) > 0: + result.SetError(res.GetError()) + result.SetStatus(lldb.eReturnStatusFailed) + return + output = res.GetOutput() + + try: + proc = subprocess.run( + filter_program, capture_output=True, text=True, input=output + ) + except (subprocess.SubprocessError, OSError) as e: + result.PutCString("Error occurred. Original disassembly:\n\n" + output) + result.SetError(str(e)) + result.SetStatus(lldb.eReturnStatusFailed) + return + + if proc.returncode: + result.PutCString("warning: {} returned non-zero value {}".format(filter_program, proc.returncode)) + + result.PutCString(proc.stdout) + result.SetStatus(lldb.eReturnStatusSuccessFinishResult) diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py index 20b9488af5597..f42a009c21f48 100644 --- a/lldb/examples/synthetic/gnu_libstdcpp.py +++ b/lldb/examples/synthetic/gnu_libstdcpp.py @@ -882,38 +882,6 @@ def update(self): return False -def VariantSummaryProvider(valobj, dict): - raw_obj = valobj.GetNonSyntheticValue() - index_obj = raw_obj.GetChildMemberWithName("_M_index") - data_obj = raw_obj.GetChildMemberWithName("_M_u") - if not (index_obj and index_obj.IsValid() and data_obj and data_obj.IsValid()): - return "" - - def get_variant_npos_value(index_byte_size): - if index_byte_size == 1: - return 0xFF - elif index_byte_size == 2: - return 0xFFFF - else: - return 0xFFFFFFFF - - npos_value = get_variant_npos_value(index_obj.GetByteSize()) - index = index_obj.GetValueAsUnsigned(0) - if index == npos_value: - return " No Value" - - # Strip references and typedefs. - variant_type = raw_obj.GetType().GetCanonicalType().GetDereferencedType() - template_arg_count = variant_type.GetNumberOfTemplateArguments() - - # Invalid index can happen when the variant is not initialized yet. - if index >= template_arg_count: - return " " - - active_type = variant_type.GetTemplateArgumentType(index) - return f" Active Type = {active_type.GetDisplayTypeName()} " - - class VariantSynthProvider: def __init__(self, valobj, dict): self.raw_obj = valobj.GetNonSyntheticValue() diff --git a/lldb/include/lldb/Core/Opcode.h b/lldb/include/lldb/Core/Opcode.h index f72f2687b54fe..91af15c62e6ab 100644 --- a/lldb/include/lldb/Core/Opcode.h +++ b/lldb/include/lldb/Core/Opcode.h @@ -32,7 +32,10 @@ class Opcode { eTypeInvalid, eType8, eType16, - eType16_2, // a 32-bit Thumb instruction, made up of two words + eType16_2, // a 32-bit Thumb instruction, made up of two words + eType16_32Tuples, // RISC-V that can have 2, 4, 6, 8 etc byte long + // instructions which will be printed in combinations of + // 16 & 32-bit words. 
eType32, eType64, eTypeBytes @@ -60,9 +63,9 @@ class Opcode { m_data.inst64 = inst; } - Opcode(uint8_t *bytes, size_t length) - : m_byte_order(lldb::eByteOrderInvalid) { - SetOpcodeBytes(bytes, length); + Opcode(uint8_t *bytes, size_t length, Opcode::Type type, + lldb::ByteOrder order) { + DoSetOpcodeBytes(bytes, length, type, order); } void Clear() { @@ -82,6 +85,8 @@ class Opcode { break; case Opcode::eType16_2: break; + case Opcode::eType16_32Tuples: + break; case Opcode::eType32: break; case Opcode::eType64: @@ -103,6 +108,8 @@ class Opcode { : m_data.inst16; case Opcode::eType16_2: break; + case Opcode::eType16_32Tuples: + break; case Opcode::eType32: break; case Opcode::eType64: @@ -122,6 +129,8 @@ class Opcode { case Opcode::eType16: return GetEndianSwap() ? llvm::byteswap(m_data.inst16) : m_data.inst16; + case Opcode::eType16_32Tuples: + break; case Opcode::eType16_2: // passthrough case Opcode::eType32: return GetEndianSwap() ? llvm::byteswap(m_data.inst32) @@ -143,6 +152,8 @@ class Opcode { case Opcode::eType16: return GetEndianSwap() ? llvm::byteswap(m_data.inst16) : m_data.inst16; + case Opcode::eType16_32Tuples: + break; case Opcode::eType16_2: // passthrough case Opcode::eType32: return GetEndianSwap() ? llvm::byteswap(m_data.inst32) @@ -186,20 +197,30 @@ class Opcode { m_byte_order = order; } + void SetOpcode16_32TupleBytes(const void *bytes, size_t length, + lldb::ByteOrder order) { + DoSetOpcodeBytes(bytes, length, eType16_32Tuples, order); + } + void SetOpcodeBytes(const void *bytes, size_t length) { + DoSetOpcodeBytes(bytes, length, eTypeBytes, lldb::eByteOrderInvalid); + } + + void DoSetOpcodeBytes(const void *bytes, size_t length, Opcode::Type type, + lldb::ByteOrder order) { if (bytes != nullptr && length > 0) { - m_type = eTypeBytes; + m_type = type; m_data.inst.length = length; assert(length < sizeof(m_data.inst.bytes)); memcpy(m_data.inst.bytes, bytes, length); - m_byte_order = lldb::eByteOrderInvalid; + m_byte_order = order; } else { m_type = eTypeInvalid; m_data.inst.length = 0; } } - int Dump(Stream *s, uint32_t min_byte_width); + int Dump(Stream *s, uint32_t min_byte_width) const; const void *GetOpcodeBytes() const { return ((m_type == Opcode::eTypeBytes) ? 
m_data.inst.bytes : nullptr); @@ -213,6 +234,8 @@ class Opcode { return sizeof(m_data.inst8); case Opcode::eType16: return sizeof(m_data.inst16); + case Opcode::eType16_32Tuples: + return m_data.inst.length; case Opcode::eType16_2: // passthrough case Opcode::eType32: return sizeof(m_data.inst32); @@ -238,6 +261,8 @@ class Opcode { return &m_data.inst8; case Opcode::eType16: return &m_data.inst16; + case Opcode::eType16_32Tuples: + return m_data.inst.bytes; case Opcode::eType16_2: // passthrough case Opcode::eType32: return &m_data.inst32; diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h index 5499e99025d8a..369785ceea5a5 100644 --- a/lldb/include/lldb/Core/PluginManager.h +++ b/lldb/include/lldb/Core/PluginManager.h @@ -787,6 +787,9 @@ class PluginManager { static std::vector GetUnwindAssemblyPluginInfo(); static bool SetUnwindAssemblyPluginEnabled(llvm::StringRef name, bool enable); + + static void AutoCompletePluginName(llvm::StringRef partial_name, + CompletionRequest &request); }; } // namespace lldb_private diff --git a/lldb/include/lldb/Interpreter/CommandCompletions.h b/lldb/include/lldb/Interpreter/CommandCompletions.h index c7292b3b1471a..0c0424cbac6eb 100644 --- a/lldb/include/lldb/Interpreter/CommandCompletions.h +++ b/lldb/include/lldb/Interpreter/CommandCompletions.h @@ -123,6 +123,10 @@ class CommandCompletions { static void ThreadIDs(CommandInterpreter &interpreter, CompletionRequest &request, SearchFilter *searcher); + static void ManagedPlugins(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher); + /// This completer works for commands whose only arguments are a command path. /// It isn't tied to an argument type because it completes not on a single /// argument but on the sequence of arguments, so you have to invoke it by diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 69e8671b6e21b..e021c7a926cf1 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1321,10 +1321,11 @@ enum CompletionType { eTypeCategoryNameCompletion = (1ul << 24), eCustomCompletion = (1ul << 25), eThreadIDCompletion = (1ul << 26), + eManagedPluginCompletion = (1ul << 27), // This last enum element is just for input validation. // Add new completions before this element, // and then increment eTerminatorCompletion's shift value - eTerminatorCompletion = (1ul << 27) + eTerminatorCompletion = (1ul << 28) }; /// Specifies if children need to be re-computed diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index a4ff96e4158ce..63fadb59a82a1 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -2268,7 +2268,7 @@ def completions_match(self, command, completions, max_completions=-1): completions, list(match_strings)[1:], "List of returned completion is wrong" ) - def completions_contain(self, command, completions): + def completions_contain(self, command, completions, match=True): """Checks that the completions for the given command contain the given list of completions.""" interp = self.dbg.GetCommandInterpreter() @@ -2276,9 +2276,16 @@ def completions_contain(self, command, completions): interp.HandleCompletion(command, len(command), 0, -1, match_strings) for completion in completions: # match_strings is a 1-indexed list, so we have to slice... 
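+            # (Hypothetical usage: with match=False the check inverts, e.g.
+            # completions_contain("log enable ", ["bogus"], match=False)
+            # asserts that "bogus" is NOT among the completions.)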
- self.assertIn( - completion, list(match_strings)[1:], "Couldn't find expected completion" - ) + if match: + self.assertIn( + completion, + list(match_strings)[1:], + "Couldn't find expected completion", + ) + else: + self.assertNotIn( + completion, list(match_strings)[1:], "Found unexpected completion" + ) def filecheck( self, command, check_file, filecheck_options="", expect_cmd_failure=False diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 58833e1b0cc78..8521ca508a479 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -344,10 +344,6 @@ endif #---------------------------------------------------------------------- ifeq "$(OS)" "Windows_NT" ifeq ($(CC_TYPE), clang) - # Clang for Windows doesn't support C++ Exceptions - CXXFLAGS += -fno-exceptions - CXXFLAGS += -D_HAS_EXCEPTIONS=0 - # MSVC 2015 or higher is required, which depends on c++14, so # append these values unconditionally. CXXFLAGS += -fms-compatibility-version=19.0 diff --git a/lldb/source/Commands/CommandCompletions.cpp b/lldb/source/Commands/CommandCompletions.cpp index 38231a8e993c7..3e223090c4286 100644 --- a/lldb/source/Commands/CommandCompletions.cpp +++ b/lldb/source/Commands/CommandCompletions.cpp @@ -87,6 +87,7 @@ bool CommandCompletions::InvokeCommonCompletionCallbacks( {lldb::eTypeCategoryNameCompletion, CommandCompletions::TypeCategoryNames}, {lldb::eThreadIDCompletion, CommandCompletions::ThreadIDs}, + {lldb::eManagedPluginCompletion, CommandCompletions::ManagedPlugins}, {lldb::eTerminatorCompletion, nullptr} // This one has to be last in the list. }; @@ -850,6 +851,13 @@ void CommandCompletions::ThreadIDs(CommandInterpreter &interpreter, } } +void CommandCompletions::ManagedPlugins(CommandInterpreter &interpreter, + CompletionRequest &request, + SearchFilter *searcher) { + PluginManager::AutoCompletePluginName(request.GetCursorArgumentPrefix(), + request); +} + void CommandCompletions::CompleteModifiableCmdPathArgs( CommandInterpreter &interpreter, CompletionRequest &request, OptionElementVector &opt_element_vector) { diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp index a110eececf4d6..a2c004d0ee97f 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp +++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp @@ -150,6 +150,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command, return; } } + m_interpreter.PrintWarningsIfNecessary(result.GetOutputStream(), + m_cmd_name); result.SetStatus(eReturnStatusSuccessFinishResult); }; diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp index a95dea63720ac..c5b91678103d5 100644 --- a/lldb/source/Commands/CommandObjectExpression.cpp +++ b/lldb/source/Commands/CommandObjectExpression.cpp @@ -470,6 +470,9 @@ bool CommandObjectExpression::EvaluateExpression(llvm::StringRef expr, return false; } + m_interpreter.PrintWarningsIfNecessary(result.GetOutputStream(), + m_cmd_name); + if (suppress_result) if (auto result_var_sp = target.GetPersistentVariable(result_valobj_sp->GetName())) { diff --git a/lldb/source/Commands/CommandObjectPlugin.cpp b/lldb/source/Commands/CommandObjectPlugin.cpp index cdc9006bf5261..093ebde5f5f2c 100644 --- a/lldb/source/Commands/CommandObjectPlugin.cpp +++ b/lldb/source/Commands/CommandObjectPlugin.cpp @@ -194,6 +194,14 @@ List only the plugin 'foo' matching a 
fully qualified name exactly Options *GetOptions() override { return &m_options; } + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), lldb::eManagedPluginCompletion, request, + nullptr); + } + protected: void DoExecute(Args &command, CommandReturnObject &result) override { size_t argc = command.GetArgumentCount(); @@ -293,6 +301,14 @@ class CommandObjectPluginEnable : public CommandObjectParsed { AddSimpleArgumentList(eArgTypeManagedPlugin); } + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), lldb::eManagedPluginCompletion, request, + nullptr); + } + ~CommandObjectPluginEnable() override = default; protected: @@ -309,6 +325,14 @@ class CommandObjectPluginDisable : public CommandObjectParsed { AddSimpleArgumentList(eArgTypeManagedPlugin); } + void + HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &opt_element_vector) override { + lldb_private::CommandCompletions::InvokeCommonCompletionCallbacks( + GetCommandInterpreter(), lldb::eManagedPluginCompletion, request, + nullptr); + } + ~CommandObjectPluginDisable() override = default; protected: diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp index 833e327579a29..925de2a5c836c 100644 --- a/lldb/source/Core/Disassembler.cpp +++ b/lldb/source/Core/Disassembler.cpp @@ -685,10 +685,12 @@ void Instruction::Dump(lldb_private::Stream *s, uint32_t max_opcode_byte_size, } } const size_t opcode_pos = ss.GetSizeOfLastLine(); - const std::string &opcode_name = - show_color ? m_markup_opcode_name : m_opcode_name; + std::string &opcode_name = show_color ? m_markup_opcode_name : m_opcode_name; const std::string &mnemonics = show_color ? m_markup_mnemonics : m_mnemonics; + if (opcode_name.empty()) + opcode_name = ""; + // The default opcode size of 7 characters is plenty for most architectures // but some like arm can pull out the occasional vqrshrun.s16. We won't get // consistent column spacing in these cases, unfortunately. Also note that we diff --git a/lldb/source/Core/Opcode.cpp b/lldb/source/Core/Opcode.cpp index 3e30d98975d8a..6c9ced9c11230 100644 --- a/lldb/source/Core/Opcode.cpp +++ b/lldb/source/Core/Opcode.cpp @@ -21,7 +21,7 @@ using namespace lldb; using namespace lldb_private; -int Opcode::Dump(Stream *s, uint32_t min_byte_width) { +int Opcode::Dump(Stream *s, uint32_t min_byte_width) const { const uint32_t previous_bytes = s->GetWrittenBytes(); switch (m_type) { case Opcode::eTypeInvalid: @@ -38,6 +38,27 @@ int Opcode::Dump(Stream *s, uint32_t min_byte_width) { s->Printf("0x%8.8x", m_data.inst32); break; + case Opcode::eType16_32Tuples: { + const bool format_as_words = (m_data.inst.length % 4) == 0; + uint32_t i = 0; + while (i < m_data.inst.length) { + if (i > 0) + s->PutChar(' '); + if (format_as_words) { + // Format as words; print 1 or more UInt32 values. + s->Printf("%2.2x%2.2x%2.2x%2.2x", m_data.inst.bytes[i + 3], + m_data.inst.bytes[i + 2], m_data.inst.bytes[i + 1], + m_data.inst.bytes[i + 0]); + i += 4; + } else { + // Format as halfwords; print 1 or more UInt16 values. 
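+        // (E.g. a 6-byte instruction {0x2f, 0x86, 0x01, 0x23, 0x45, 0x67} is
+        // not a multiple of 4 bytes long, so it prints as little-endian
+        // halfwords: "862f 2301 6745".)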
+ s->Printf("%2.2x%2.2x", m_data.inst.bytes[i + 1], + m_data.inst.bytes[i + 0]); + i += 2; + } + } + } break; + case Opcode::eType64: s->Printf("0x%16.16" PRIx64, m_data.inst64); break; @@ -69,6 +90,7 @@ lldb::ByteOrder Opcode::GetDataByteOrder() const { case Opcode::eType8: case Opcode::eType16: case Opcode::eType16_2: + case Opcode::eType16_32Tuples: case Opcode::eType32: case Opcode::eType64: return endian::InlHostByteOrder(); @@ -113,6 +135,9 @@ uint32_t Opcode::GetData(DataExtractor &data) const { swap_buf[3] = m_data.inst.bytes[2]; buf = swap_buf; break; + case Opcode::eType16_32Tuples: + buf = GetOpcodeDataBytes(); + break; case Opcode::eType32: *(uint32_t *)swap_buf = llvm::byteswap(m_data.inst32); buf = swap_buf; diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index 3f20a96edc187..bece690a85fa5 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -18,6 +18,7 @@ #include "lldb/Utility/Status.h" #include "lldb/Utility/StringList.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -2473,3 +2474,34 @@ bool PluginManager::SetUnwindAssemblyPluginEnabled(llvm::StringRef name, bool enable) { return GetUnwindAssemblyInstances().SetInstanceEnabled(name, enable); } + +void PluginManager::AutoCompletePluginName(llvm::StringRef name, + CompletionRequest &request) { + // Split the name into the namespace and the plugin name. + // If there is no dot then the ns_name will be equal to name and + // plugin_prefix will be empty. + llvm::StringRef ns_name, plugin_prefix; + std::tie(ns_name, plugin_prefix) = name.split('.'); + + for (const PluginNamespace &plugin_ns : GetPluginNamespaces()) { + // If the plugin namespace matches exactly then + // add all the plugins in this namespace as completions if the + // plugin names starts with the plugin_prefix. If the plugin_prefix + // is empty then it will match all the plugins (empty string is a + // prefix of everything). + if (plugin_ns.name == ns_name) { + for (const RegisteredPluginInfo &plugin : plugin_ns.get_info()) { + llvm::SmallString<128> buf; + if (plugin.name.starts_with(plugin_prefix)) + request.AddCompletion( + (plugin_ns.name + "." + plugin.name).toStringRef(buf)); + } + } else if (plugin_ns.name.starts_with(name) && + !plugin_ns.get_info().empty()) { + // Otherwise check if the namespace is a prefix of the full name. + // Use a partial completion here so that we can either operate on the full + // namespace or tab-complete to the next level. + request.AddCompletion(plugin_ns.name, "", CompletionMode::Partial); + } + } +} diff --git a/lldb/source/Host/common/NativeProcessProtocol.cpp b/lldb/source/Host/common/NativeProcessProtocol.cpp index 405acbb5662d6..196f54b93538d 100644 --- a/lldb/source/Host/common/NativeProcessProtocol.cpp +++ b/lldb/source/Host/common/NativeProcessProtocol.cpp @@ -366,12 +366,19 @@ Status NativeProcessProtocol::RemoveSoftwareBreakpoint(lldb::addr_t addr) { if (--it->second.ref_count > 0) return Status(); + // Remove the entry from m_software_breakpoints rightaway, so that we don't + // leave behind an entry with ref_count == 0 in case one of the following + // conditions returns an error. The breakpoint is moved so that it can be + // accessed below. + SoftwareBreakpoint bkpt = std::move(it->second); + m_software_breakpoints.erase(it); + // This is the last reference. Let's remove the breakpoint. 
Status error; // Clear a software breakpoint instruction - llvm::SmallVector curr_break_op( - it->second.breakpoint_opcodes.size(), 0); + llvm::SmallVector curr_break_op(bkpt.breakpoint_opcodes.size(), + 0); // Read the breakpoint opcode size_t bytes_read = 0; @@ -382,10 +389,10 @@ Status NativeProcessProtocol::RemoveSoftwareBreakpoint(lldb::addr_t addr) { "addr=0x%" PRIx64 ": tried to read %zu bytes but only read %zu", addr, curr_break_op.size(), bytes_read); } - const auto &saved = it->second.saved_opcodes; + const auto &saved = bkpt.saved_opcodes; // Make sure the breakpoint opcode exists at this address - if (llvm::ArrayRef(curr_break_op) != it->second.breakpoint_opcodes) { - if (curr_break_op != it->second.saved_opcodes) + if (llvm::ArrayRef(curr_break_op) != bkpt.breakpoint_opcodes) { + if (curr_break_op != bkpt.saved_opcodes) return Status::FromErrorString( "Original breakpoint trap is no longer in memory."); LLDB_LOG(log, @@ -418,7 +425,6 @@ Status NativeProcessProtocol::RemoveSoftwareBreakpoint(lldb::addr_t addr) { llvm::make_range(saved.begin(), saved.end())); } - m_software_breakpoints.erase(it); return Status(); } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 00c3472444d2e..da545f18d9b15 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -616,7 +616,8 @@ void CommandInterpreter::LoadCommandDictionary() { std::unique_ptr break_regex_cmd_up( new CommandObjectRegexCommand( *this, "_regexp-break", - "Set a breakpoint using one of several shorthand formats.", + "Set a breakpoint using one of several shorthand formats, or list " + "the existing breakpoints if no arguments are provided.", "\n" "_regexp-break ::\n" " main.c:12:21 // Break at line 12 and column " @@ -643,7 +644,10 @@ void CommandInterpreter::LoadCommandDictionary() { " /break here/ // Break on source lines in " "current file\n" " // containing text 'break " - "here'.\n", + "here'.\n" + "_regexp-break\n" + " // List the existing " + "breakpoints\n", lldb::eSymbolCompletion | lldb::eSourceFileCompletion, false)); if (break_regex_cmd_up) { diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index 644084ba8d57a..d92e16366f0ac 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -59,8 +59,8 @@ class DisassemblerLLVMC::MCDisasmInstance { ~MCDisasmInstance() = default; - uint64_t GetMCInst(const uint8_t *opcode_data, size_t opcode_data_len, - lldb::addr_t pc, llvm::MCInst &mc_inst) const; + bool GetMCInst(const uint8_t *opcode_data, size_t opcode_data_len, + lldb::addr_t pc, llvm::MCInst &mc_inst, uint64_t &size) const; void PrintMCInst(llvm::MCInst &mc_inst, lldb::addr_t pc, std::string &inst_string, std::string &comments_string); void SetStyle(bool use_hex_immed, HexImmediateStyle hex_style); @@ -486,8 +486,13 @@ class InstructionLLVMC : public lldb_private::Instruction { break; default: - m_opcode.SetOpcodeBytes(data.PeekData(data_offset, min_op_byte_size), - min_op_byte_size); + if (arch.GetTriple().isRISCV()) + m_opcode.SetOpcode16_32TupleBytes( + data.PeekData(data_offset, min_op_byte_size), min_op_byte_size, + byte_order); + else + m_opcode.SetOpcodeBytes( + data.PeekData(data_offset, min_op_byte_size), min_op_byte_size); got_op = true; break; } @@ -524,13 +529,16 @@ class InstructionLLVMC : public 
lldb_private::Instruction { const addr_t pc = m_address.GetFileAddress(); llvm::MCInst inst; - const size_t inst_size = - mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst); - if (inst_size == 0) - m_opcode.Clear(); - else { - m_opcode.SetOpcodeBytes(opcode_data, inst_size); - m_is_valid = true; + uint64_t inst_size = 0; + m_is_valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, + pc, inst, inst_size); + m_opcode.Clear(); + if (inst_size != 0) { + if (arch.GetTriple().isRISCV()) + m_opcode.SetOpcode16_32TupleBytes(opcode_data, inst_size, + byte_order); + else + m_opcode.SetOpcodeBytes(opcode_data, inst_size); } } } @@ -604,10 +612,11 @@ class InstructionLLVMC : public lldb_private::Instruction { const uint8_t *opcode_data = data.GetDataStart(); const size_t opcode_data_len = data.GetByteSize(); llvm::MCInst inst; - size_t inst_size = - mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst); + uint64_t inst_size = 0; + bool valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, + inst, inst_size); - if (inst_size > 0) { + if (valid && inst_size > 0) { mc_disasm_ptr->SetStyle(use_hex_immediates, hex_style); const bool saved_use_color = mc_disasm_ptr->GetUseColor(); @@ -1206,9 +1215,10 @@ class InstructionLLVMC : public lldb_private::Instruction { const uint8_t *opcode_data = data.GetDataStart(); const size_t opcode_data_len = data.GetByteSize(); llvm::MCInst inst; - const size_t inst_size = - mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst); - if (inst_size == 0) + uint64_t inst_size = 0; + const bool valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, + pc, inst, inst_size); + if (!valid) return; m_has_visited_instruction = true; @@ -1337,19 +1347,19 @@ DisassemblerLLVMC::MCDisasmInstance::MCDisasmInstance( m_asm_info_up && m_context_up && m_disasm_up && m_instr_printer_up); } -uint64_t DisassemblerLLVMC::MCDisasmInstance::GetMCInst( - const uint8_t *opcode_data, size_t opcode_data_len, lldb::addr_t pc, - llvm::MCInst &mc_inst) const { +bool DisassemblerLLVMC::MCDisasmInstance::GetMCInst(const uint8_t *opcode_data, + size_t opcode_data_len, + lldb::addr_t pc, + llvm::MCInst &mc_inst, + uint64_t &size) const { llvm::ArrayRef data(opcode_data, opcode_data_len); llvm::MCDisassembler::DecodeStatus status; - uint64_t new_inst_size; - status = m_disasm_up->getInstruction(mc_inst, new_inst_size, data, pc, - llvm::nulls()); + status = m_disasm_up->getInstruction(mc_inst, size, data, pc, llvm::nulls()); if (status == llvm::MCDisassembler::Success) - return new_inst_size; + return true; else - return 0; + return false; } void DisassemblerLLVMC::MCDisasmInstance::PrintMCInst( diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt index 296159ea28407..5905d9b9a6d03 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt @@ -14,11 +14,11 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN CxxStringTypes.cpp Generic.cpp GenericBitset.cpp + GenericList.cpp GenericOptional.cpp LibCxx.cpp LibCxxAtomic.cpp LibCxxInitializerList.cpp - LibCxxList.cpp LibCxxMap.cpp LibCxxQueue.cpp LibCxxRangesRefView.cpp @@ -35,6 +35,8 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN LibStdcppUniquePointer.cpp MsvcStl.cpp MsvcStlSmartPointer.cpp + MsvcStlTuple.cpp + MsvcStlVector.cpp MSVCUndecoratedNameParser.cpp LINK_COMPONENTS diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp 
b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 9a869f3ea0289..a8ebde0b55815 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -1404,7 +1404,7 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { stl_deref_flags.SetFrontEndWantsDereference(); cpp_category_sp->AddTypeSynthetic( - "^std::(__debug::)?vector<.+>(( )?&)?$", eFormatterMatchRegex, + "^std::__debug::vector<.+>(( )?&)?$", eFormatterMatchRegex, SyntheticChildrenSP(new ScriptedSyntheticChildren( stl_synth_flags, "lldb.formatters.cpp.gnu_libstdcpp.StdVectorSynthProvider"))); @@ -1440,14 +1440,12 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { stl_deref_flags, "lldb.formatters.cpp.gnu_libstdcpp.StdUnorderedMapSynthProvider"))); cpp_category_sp->AddTypeSynthetic( - "^std::((__debug::)?|(__cxx11::)?)list<.+>(( )?&)?$", - eFormatterMatchRegex, + "^std::__(debug|cxx11)::list<.+>(( )?&)?$", eFormatterMatchRegex, SyntheticChildrenSP(new ScriptedSyntheticChildren( stl_deref_flags, "lldb.formatters.cpp.gnu_libstdcpp.StdListSynthProvider"))); cpp_category_sp->AddTypeSynthetic( - "^std::((__debug::)?|(__cxx11::)?)forward_list<.+>(( )?&)?$", - eFormatterMatchRegex, + "^std::__(debug|cxx11)::forward_list<.+>(( )?&)?$", eFormatterMatchRegex, SyntheticChildrenSP(new ScriptedSyntheticChildren( stl_synth_flags, "lldb.formatters.cpp.gnu_libstdcpp.StdForwardListSynthProvider"))); @@ -1465,10 +1463,10 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "libstdc++ std::bitset summary provider", "^std::(__debug::)?bitset<.+>(( )?&)?$", stl_summary_flags, true); - AddCXXSummary( - cpp_category_sp, lldb_private::formatters::ContainerSizeSummaryProvider, - "libstdc++ std::vector summary provider", - "^std::(__debug::)?vector<.+>(( )?&)?$", stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, + lldb_private::formatters::ContainerSizeSummaryProvider, + "libstdc++ std::__debug::vector summary provider", + "^std::__debug::vector<.+>(( )?&)?$", stl_summary_flags, true); AddCXXSummary( cpp_category_sp, lldb_private::formatters::ContainerSizeSummaryProvider, @@ -1501,27 +1499,19 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "^std::(__debug::)?unordered_(multi)?(map|set)<.+> >$", stl_summary_flags, true); - AddCXXSummary(cpp_category_sp, - lldb_private::formatters::ContainerSizeSummaryProvider, - "libstdc++ std::list summary provider", - "^std::((__debug::)?|(__cxx11::)?)list<.+>(( )?&)?$", - stl_summary_flags, true); - - AddCXXSummary(cpp_category_sp, ContainerSizeSummaryProvider, - "libstdc++ std::tuple summary provider", - "^std::tuple<.*>(( )?&)?$", stl_summary_flags, true); + AddCXXSummary( + cpp_category_sp, lldb_private::formatters::ContainerSizeSummaryProvider, + "libstdc++ debug std::list summary provider", + "^std::__(debug|cxx11)::list<.+>(( )?&)?$", stl_summary_flags, true); cpp_category_sp->AddTypeSummary( - "^std::((__debug::)?|(__cxx11::)?)forward_list<.+>(( )?&)?$", - eFormatterMatchRegex, + "^std::__(debug|cxx11)::forward_list<.+>(( )?&)?$", eFormatterMatchRegex, TypeSummaryImplSP(new ScriptSummaryFormat( stl_summary_flags, "lldb.formatters.cpp.gnu_libstdcpp.ForwardListSummaryProvider"))); - cpp_category_sp->AddTypeSummary( - "^std::variant<.+>$", eFormatterMatchRegex, - TypeSummaryImplSP(new ScriptSummaryFormat( - stl_summary_flags, - "lldb.formatters.cpp.gnu_libstdcpp.VariantSummaryProvider"))); + 
AddCXXSummary(cpp_category_sp, LibStdcppVariantSummaryProvider, + "libstdc++ std::variant summary provider", "^std::variant<.+>$", + stl_summary_flags, true); AddCXXSynthetic( cpp_category_sp, @@ -1540,11 +1530,6 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { lldb_private::formatters::LibStdcppUniquePtrSyntheticFrontEndCreator, "std::unique_ptr synthetic children", "^std::unique_ptr<.+>(( )?&)?$", stl_synth_flags, true); - AddCXXSynthetic( - cpp_category_sp, - lldb_private::formatters::LibStdcppTupleSyntheticFrontEndCreator, - "std::tuple synthetic children", "^std::tuple<.*>(( )?&)?$", - stl_synth_flags, true); static constexpr const char *const libstdcpp_std_coroutine_handle_regex = "^std::coroutine_handle<.+>(( )?&)?$"; @@ -1613,6 +1598,56 @@ static bool GenericUniquePtrSummaryProvider(ValueObject &valobj, Stream &stream, return LibStdcppUniquePointerSummaryProvider(valobj, stream, options); } +static SyntheticChildrenFrontEnd * +GenericTupleSyntheticFrontEndCreator(CXXSyntheticChildren *children, + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + if (IsMsvcStlTuple(*valobj_sp)) + return MsvcStlTupleSyntheticFrontEndCreator(children, valobj_sp); + return LibStdcppTupleSyntheticFrontEndCreator(children, valobj_sp); +} + +static SyntheticChildrenFrontEnd * +GenericVectorSyntheticFrontEndCreator(CXXSyntheticChildren *children, + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + // checks for vector and vector + if (auto *msvc = MsvcStlVectorSyntheticFrontEndCreator(valobj_sp)) + return msvc; + + return new ScriptedSyntheticChildren::FrontEnd( + "lldb.formatters.cpp.gnu_libstdcpp.StdVectorSynthProvider", *valobj_sp); +} + +static SyntheticChildrenFrontEnd * +GenericListSyntheticFrontEndCreator(CXXSyntheticChildren *children, + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + if (IsMsvcStlList(*valobj_sp)) + return MsvcStlListSyntheticFrontEndCreator(children, valobj_sp); + return new ScriptedSyntheticChildren::FrontEnd( + "lldb.formatters.cpp.gnu_libstdcpp.StdListSynthProvider", *valobj_sp); +} + +static SyntheticChildrenFrontEnd * +GenericForwardListSyntheticFrontEndCreator(CXXSyntheticChildren *children, + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + if (IsMsvcStlList(*valobj_sp)) + return MsvcStlForwardListSyntheticFrontEndCreator(children, valobj_sp); + return new ScriptedSyntheticChildren::FrontEnd( + "lldb.formatters.cpp.gnu_libstdcpp.StdForwardListSynthProvider", + *valobj_sp); +} + /// Load formatters that are formatting types from more than one STL static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { if (!cpp_category_sp) @@ -1668,6 +1703,15 @@ static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { AddCXXSynthetic(cpp_category_sp, GenericUniquePtrSyntheticFrontEndCreator, "std::unique_ptr synthetic children", "^std::unique_ptr<.+>(( )?&)?$", stl_synth_flags, true); + AddCXXSynthetic(cpp_category_sp, GenericTupleSyntheticFrontEndCreator, + "std::tuple synthetic children", "^std::tuple<.*>(( )?&)?$", + stl_synth_flags, true); + AddCXXSynthetic(cpp_category_sp, GenericListSyntheticFrontEndCreator, + "std::list synthetic children", "^std::list<.+>(( )?&)?$", + stl_synth_flags, true); + AddCXXSynthetic(cpp_category_sp, GenericForwardListSyntheticFrontEndCreator, + "std::forward_list synthetic children", + "^std::forward_list<.+>(( )?&)?$", stl_synth_flags, true); AddCXXSummary(cpp_category_sp, 
GenericSmartPointerSummaryProvider, "MSVC STL/libstdc++ std::shared_ptr summary provider", @@ -1678,6 +1722,23 @@ static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { AddCXXSummary(cpp_category_sp, GenericUniquePtrSummaryProvider, "MSVC STL/libstdc++ std::unique_ptr summary provider", "^std::unique_ptr<.+>(( )?&)?$", stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, ContainerSizeSummaryProvider, + "MSVC STL/libstdc++ std::tuple summary provider", + "^std::tuple<.*>(( )?&)?$", stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, ContainerSizeSummaryProvider, + "MSVC/libstdc++ std::vector summary provider", + "^std::vector<.+>(( )?&)?$", stl_summary_flags, true); + AddCXXSynthetic(cpp_category_sp, GenericVectorSyntheticFrontEndCreator, + "MSVC/libstdc++ std::vector synthetic provider", + "^std::vector<.+>(( )?&)?$", stl_synth_flags, true); + AddCXXSummary(cpp_category_sp, ContainerSizeSummaryProvider, + "MSVC STL/libstdc++ std::list summary provider", + "^std::list<.+>(( )?&)?$", stl_summary_flags, true); + cpp_category_sp->AddTypeSummary( + "^std::forward_list<.+>(( )?&)?$", eFormatterMatchRegex, + TypeSummaryImplSP(new ScriptSummaryFormat( + stl_summary_flags, + "lldb.formatters.cpp.gnu_libstdcpp.ForwardListSummaryProvider"))); } static void LoadMsvcStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp similarity index 58% rename from lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp rename to lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp index 826e6ab090e10..ea1edbfd3ac9b 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp @@ -1,4 +1,4 @@ -//===-- LibCxxList.cpp ----------------------------------------------------===// +//===-- GenericList.cpp ---------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,14 +7,11 @@ //===----------------------------------------------------------------------===// #include "LibCxx.h" +#include "MsvcStl.h" -#include "Plugins/TypeSystem/Clang/TypeSystemClang.h" #include "lldb/DataFormatters/FormattersHelpers.h" #include "lldb/Target/Target.h" -#include "lldb/Utility/DataBufferHeap.h" -#include "lldb/Utility/Endian.h" #include "lldb/Utility/Status.h" -#include "lldb/Utility/Stream.h" #include "lldb/ValueObject/ValueObject.h" #include "lldb/ValueObject/ValueObjectConstResult.h" #include "lldb/lldb-enumerations.h" @@ -25,31 +22,27 @@ using namespace lldb_private::formatters; namespace { -class ListEntry { +enum class StlType { + LibCxx, + MsvcStl, +}; + +template class ListEntry { public: ListEntry() = default; ListEntry(ValueObjectSP entry_sp) : m_entry_sp(std::move(entry_sp)) {} ListEntry(ValueObject *entry) : m_entry_sp(entry ? 
entry->GetSP() : ValueObjectSP()) {} - ListEntry next() { - if (!m_entry_sp) - return ListEntry(); - return ListEntry(m_entry_sp->GetChildMemberWithName("__next_")); - } - - ListEntry prev() { - if (!m_entry_sp) - return ListEntry(); - return ListEntry(m_entry_sp->GetChildMemberWithName("__prev_")); - } - uint64_t value() const { if (!m_entry_sp) return 0; return m_entry_sp->GetValueAsUnsigned(0); } + ListEntry next(); + ListEntry prev(); + bool null() { return (value() == 0); } explicit operator bool() { return GetEntry() && !null(); } @@ -66,10 +59,34 @@ class ListEntry { ValueObjectSP m_entry_sp; }; -class ListIterator { +template <> ListEntry ListEntry::next() { + if (!m_entry_sp) + return ListEntry(); + return ListEntry(m_entry_sp->GetChildMemberWithName("__next_")); +} + +template <> ListEntry ListEntry::prev() { + if (!m_entry_sp) + return ListEntry(); + return ListEntry(m_entry_sp->GetChildMemberWithName("__prev_")); +} + +template <> ListEntry ListEntry::next() { + if (!m_entry_sp) + return ListEntry(); + return ListEntry(m_entry_sp->GetChildMemberWithName("_Next")); +} + +template <> ListEntry ListEntry::prev() { + if (!m_entry_sp) + return ListEntry(); + return ListEntry(m_entry_sp->GetChildMemberWithName("_Prev")); +} + +template class ListIterator { public: ListIterator() = default; - ListIterator(ListEntry entry) : m_entry(std::move(entry)) {} + ListIterator(ListEntry entry) : m_entry(std::move(entry)) {} ListIterator(ValueObjectSP entry) : m_entry(std::move(entry)) {} ListIterator(ValueObject *entry) : m_entry(entry) {} @@ -101,9 +118,10 @@ class ListIterator { void prev() { m_entry = m_entry.prev(); } private: - ListEntry m_entry; + ListEntry m_entry; }; +template class AbstractListFrontEnd : public SyntheticChildrenFrontEnd { public: llvm::Expected GetIndexOfChildWithName(ConstString name) override { @@ -124,33 +142,31 @@ class AbstractListFrontEnd : public SyntheticChildrenFrontEnd { ValueObject *m_head = nullptr; static constexpr bool g_use_loop_detect = true; - size_t m_loop_detected = 0; // The number of elements that have had loop - // detection run over them. - ListEntry m_slow_runner; // Used for loop detection - ListEntry m_fast_runner; // Used for loop detection + size_t m_loop_detected = 0; // The number of elements that have had loop + // detection run over them. 
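// Design note (from the specializations above): the StlType template
// parameter lets one traversal-and-caching implementation serve both
// libraries; only next()/prev() differ, since libc++ links nodes
// through "__next_"/"__prev_" while the MSVC STL uses "_Next"/"_Prev".
// The two runners declared below drive the cycle check over whichever
// layout is selected.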
+ ListEntry m_slow_runner; // Used for loop detection + ListEntry m_fast_runner; // Used for loop detection size_t m_list_capping_size = 0; CompilerType m_element_type; - std::map m_iterators; + std::map> m_iterators; bool HasLoop(size_t count); ValueObjectSP GetItem(size_t idx); }; -class ForwardListFrontEnd : public AbstractListFrontEnd { +class LibCxxForwardListFrontEnd : public AbstractListFrontEnd { public: - ForwardListFrontEnd(ValueObject &valobj); + LibCxxForwardListFrontEnd(ValueObject &valobj); llvm::Expected CalculateNumChildren() override; ValueObjectSP GetChildAtIndex(uint32_t idx) override; lldb::ChildCacheState Update() override; }; -class ListFrontEnd : public AbstractListFrontEnd { +class LibCxxListFrontEnd : public AbstractListFrontEnd { public: - ListFrontEnd(lldb::ValueObjectSP valobj_sp); - - ~ListFrontEnd() override = default; + LibCxxListFrontEnd(lldb::ValueObjectSP valobj_sp); llvm::Expected CalculateNumChildren() override; @@ -163,9 +179,34 @@ class ListFrontEnd : public AbstractListFrontEnd { ValueObject *m_tail = nullptr; }; +class MsvcStlForwardListFrontEnd + : public AbstractListFrontEnd { +public: + MsvcStlForwardListFrontEnd(ValueObject &valobj); + + llvm::Expected CalculateNumChildren() override; + ValueObjectSP GetChildAtIndex(uint32_t idx) override; + lldb::ChildCacheState Update() override; +}; + +class MsvcStlListFrontEnd : public AbstractListFrontEnd { +public: + MsvcStlListFrontEnd(lldb::ValueObjectSP valobj_sp); + + llvm::Expected CalculateNumChildren() override; + + lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; + + lldb::ChildCacheState Update() override; + +private: + ValueObject *m_tail = nullptr; +}; + } // end anonymous namespace -lldb::ChildCacheState AbstractListFrontEnd::Update() { +template +lldb::ChildCacheState AbstractListFrontEnd::Update() { m_loop_detected = 0; m_count = UINT32_MAX; m_head = nullptr; @@ -191,7 +232,7 @@ lldb::ChildCacheState AbstractListFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool AbstractListFrontEnd::HasLoop(size_t count) { +template bool AbstractListFrontEnd::HasLoop(size_t count) { if (!g_use_loop_detect) return false; // don't bother checking for a loop if we won't actually need to jump nodes @@ -201,7 +242,7 @@ bool AbstractListFrontEnd::HasLoop(size_t count) { if (m_loop_detected == 0) { // This is the first time we are being run (after the last update). Set up // the loop invariant for the first element. 
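// (For reference, the slow/fast scheme below is Floyd's
// tortoise-and-hare cycle detection. A minimal standalone sketch over
// a hypothetical Node type, independent of LLDB:
//
//   struct Node { Node *next; };
//   bool HasCycle(const Node *head) {
//     const Node *slow = head, *fast = head;
//     while (fast && fast->next) {
//       slow = slow->next;        // advances one node per step
//       fast = fast->next->next;  // advances two nodes per step
//       if (slow == fast)
//         return true;            // the runners can only meet on a cycle
//     }
//     return false;               // fast reached the end: no cycle
//   }
//
// The implementation below additionally resumes from m_loop_detected,
// so repeated GetChildAtIndex() calls amortize the check instead of
// restarting from the head each time.)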
- m_slow_runner = ListEntry(m_head).next(); + m_slow_runner = ListEntry(m_head).next(); m_fast_runner = m_slow_runner.next(); m_loop_detected = 1; } @@ -225,9 +266,10 @@ bool AbstractListFrontEnd::HasLoop(size_t count) { return m_slow_runner == m_fast_runner; } -ValueObjectSP AbstractListFrontEnd::GetItem(size_t idx) { +template +ValueObjectSP AbstractListFrontEnd::GetItem(size_t idx) { size_t advance = idx; - ListIterator current(m_head); + ListIterator current(m_head); if (idx > 0) { auto cached_iterator = m_iterators.find(idx - 1); if (cached_iterator != m_iterators.end()) { @@ -240,16 +282,16 @@ ValueObjectSP AbstractListFrontEnd::GetItem(size_t idx) { return value_sp; } -ForwardListFrontEnd::ForwardListFrontEnd(ValueObject &valobj) +LibCxxForwardListFrontEnd::LibCxxForwardListFrontEnd(ValueObject &valobj) : AbstractListFrontEnd(valobj) { Update(); } -llvm::Expected ForwardListFrontEnd::CalculateNumChildren() { +llvm::Expected LibCxxForwardListFrontEnd::CalculateNumChildren() { if (m_count != UINT32_MAX) return m_count; - ListEntry current(m_head); + ListEntry current(m_head); m_count = 0; while (current && m_count < m_list_capping_size) { ++m_count; @@ -258,7 +300,7 @@ llvm::Expected ForwardListFrontEnd::CalculateNumChildren() { return m_count; } -ValueObjectSP ForwardListFrontEnd::GetChildAtIndex(uint32_t idx) { +ValueObjectSP LibCxxForwardListFrontEnd::GetChildAtIndex(uint32_t idx) { if (idx >= CalculateNumChildrenIgnoringErrors()) return nullptr; @@ -289,7 +331,7 @@ ValueObjectSP ForwardListFrontEnd::GetChildAtIndex(uint32_t idx) { m_element_type); } -lldb::ChildCacheState ForwardListFrontEnd::Update() { +lldb::ChildCacheState LibCxxForwardListFrontEnd::Update() { AbstractListFrontEnd::Update(); Status err; @@ -312,13 +354,13 @@ lldb::ChildCacheState ForwardListFrontEnd::Update() { return ChildCacheState::eRefetch; } -ListFrontEnd::ListFrontEnd(lldb::ValueObjectSP valobj_sp) +LibCxxListFrontEnd::LibCxxListFrontEnd(lldb::ValueObjectSP valobj_sp) : AbstractListFrontEnd(*valobj_sp) { if (valobj_sp) Update(); } -llvm::Expected ListFrontEnd::CalculateNumChildren() { +llvm::Expected LibCxxListFrontEnd::CalculateNumChildren() { if (m_count != UINT32_MAX) return m_count; if (!m_head || !m_tail || m_node_address == 0) @@ -351,7 +393,7 @@ llvm::Expected ListFrontEnd::CalculateNumChildren() { if (next_val == prev_val) return 1; uint64_t size = 2; - ListEntry current(m_head); + ListEntry current(m_head); while (current.next() && current.next().value() != m_node_address) { size++; current = current.next(); @@ -361,7 +403,7 @@ llvm::Expected ListFrontEnd::CalculateNumChildren() { return m_count = (size - 1); } -lldb::ValueObjectSP ListFrontEnd::GetChildAtIndex(uint32_t idx) { +lldb::ValueObjectSP LibCxxListFrontEnd::GetChildAtIndex(uint32_t idx) { static ConstString g_value("__value_"); static ConstString g_next("__next_"); @@ -412,7 +454,7 @@ lldb::ValueObjectSP ListFrontEnd::GetChildAtIndex(uint32_t idx) { m_element_type); } -lldb::ChildCacheState ListFrontEnd::Update() { +lldb::ChildCacheState LibCxxListFrontEnd::Update() { AbstractListFrontEnd::Update(); m_tail = nullptr; m_node_address = 0; @@ -432,13 +474,167 @@ lldb::ChildCacheState ListFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } +MsvcStlForwardListFrontEnd::MsvcStlForwardListFrontEnd(ValueObject &valobj) + : AbstractListFrontEnd(valobj) { + Update(); +} + +llvm::Expected MsvcStlForwardListFrontEnd::CalculateNumChildren() { + if (m_count != UINT32_MAX) + return m_count; + + ListEntry current(m_head); + m_count = 0; + 
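// As in the libc++ front end above, the walk is capped at
// m_list_capping_size so that a corrupted (or simply enormous) list
// cannot make the formatter spin indefinitely.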
while (current && m_count < m_list_capping_size) { + ++m_count; + current = current.next(); + } + return m_count; +} + +ValueObjectSP MsvcStlForwardListFrontEnd::GetChildAtIndex(uint32_t idx) { + if (idx >= CalculateNumChildrenIgnoringErrors()) + return nullptr; + + if (!m_head) + return nullptr; + + if (HasLoop(idx + 1)) + return nullptr; + + ValueObjectSP current_sp = GetItem(idx); + if (!current_sp) + return nullptr; + + current_sp = current_sp->GetChildAtIndex(1); // get the _Myval child + if (!current_sp) + return nullptr; + + // we need to copy current_sp into a new object otherwise we will end up with + // all items named _Myval + DataExtractor data; + Status error; + current_sp->GetData(data, error); + if (error.Fail()) + return nullptr; + + return CreateValueObjectFromData(llvm::formatv("[{0}]", idx).str(), data, + m_backend.GetExecutionContextRef(), + m_element_type); +} + +lldb::ChildCacheState MsvcStlForwardListFrontEnd::Update() { + AbstractListFrontEnd::Update(); + + if (auto head_sp = + m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Myhead"})) + m_head = head_sp.get(); + + return ChildCacheState::eRefetch; +} + +MsvcStlListFrontEnd::MsvcStlListFrontEnd(lldb::ValueObjectSP valobj_sp) + : AbstractListFrontEnd(*valobj_sp) { + if (valobj_sp) + Update(); +} + +llvm::Expected MsvcStlListFrontEnd::CalculateNumChildren() { + if (m_count != UINT32_MAX) + return m_count; + if (!m_head || !m_tail) + return 0; + + auto size_sp = + m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Mysize"}); + if (!size_sp) + return llvm::createStringError("Failed to resolve size."); + + m_count = size_sp->GetValueAsUnsigned(UINT32_MAX); + if (m_count == UINT32_MAX) + return llvm::createStringError("Failed to read size value."); + + return m_count; +} + +lldb::ValueObjectSP MsvcStlListFrontEnd::GetChildAtIndex(uint32_t idx) { + if (idx >= CalculateNumChildrenIgnoringErrors()) + return lldb::ValueObjectSP(); + + if (!m_head || !m_tail) + return lldb::ValueObjectSP(); + + if (HasLoop(idx + 1)) + return lldb::ValueObjectSP(); + + ValueObjectSP current_sp = GetItem(idx); + if (!current_sp) + return lldb::ValueObjectSP(); + + current_sp = current_sp->GetChildAtIndex(2); // get the _Myval child + if (!current_sp) + return lldb::ValueObjectSP(); + + // we need to copy current_sp into a new object otherwise we will end up with + // all items named _Myval + DataExtractor data; + Status error; + current_sp->GetData(data, error); + if (error.Fail()) + return lldb::ValueObjectSP(); + + StreamString name; + name.Printf("[%" PRIu64 "]", (uint64_t)idx); + return CreateValueObjectFromData(name.GetString(), data, + m_backend.GetExecutionContextRef(), + m_element_type); +} + +lldb::ChildCacheState MsvcStlListFrontEnd::Update() { + AbstractListFrontEnd::Update(); + m_tail = nullptr; + m_head = nullptr; + + ValueObjectSP last = + m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Myhead"}); + if (!last) + return lldb::ChildCacheState::eRefetch; + ValueObjectSP first = last->GetChildMemberWithName("_Next"); + if (!first) + return lldb::ChildCacheState::eRefetch; + + m_head = first.get(); + m_tail = last.get(); + + return lldb::ChildCacheState::eRefetch; +} + SyntheticChildrenFrontEnd *formatters::LibcxxStdListSyntheticFrontEndCreator( CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) { - return (valobj_sp ? new ListFrontEnd(valobj_sp) : nullptr); + return (valobj_sp ? 
new LibCxxListFrontEnd(valobj_sp) : nullptr); } SyntheticChildrenFrontEnd * formatters::LibcxxStdForwardListSyntheticFrontEndCreator( CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) { - return valobj_sp ? new ForwardListFrontEnd(*valobj_sp) : nullptr; + return valobj_sp ? new LibCxxForwardListFrontEnd(*valobj_sp) : nullptr; +} + +bool formatters::IsMsvcStlList(ValueObject &valobj) { + if (auto valobj_sp = valobj.GetNonSyntheticValue()) + return valobj_sp->GetChildMemberWithName("_Mypair") != nullptr; + + return false; +} + +SyntheticChildrenFrontEnd * +formatters::MsvcStlListSyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP valobj_sp) { + return (valobj_sp ? new MsvcStlListFrontEnd(valobj_sp) : nullptr); +} + +SyntheticChildrenFrontEnd * +formatters::MsvcStlForwardListSyntheticFrontEndCreator( + CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) { + return valobj_sp ? new MsvcStlForwardListFrontEnd(*valobj_sp) : nullptr; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp index c80a52d0f9ed6..595e835b37df9 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp @@ -366,3 +366,49 @@ bool lldb_private::formatters::LibStdcppSmartPointerSummaryProvider( return true; } + +static uint64_t LibStdcppVariantNposValue(size_t index_byte_size) { + switch (index_byte_size) { + case 1: + return 0xff; + case 2: + return 0xffff; + default: + return 0xffff'ffff; + } +} + +bool formatters::LibStdcppVariantSummaryProvider( + ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { + ValueObjectSP valobj_sp = valobj.GetNonSyntheticValue(); + if (!valobj_sp) + return false; + + ValueObjectSP index_obj = valobj_sp->GetChildMemberWithName("_M_index"); + ValueObjectSP data_obj = valobj_sp->GetChildMemberWithName("_M_u"); + if (!index_obj || !data_obj) + return false; + + auto index_bytes = index_obj->GetByteSize(); + if (!index_bytes) + return false; + auto npos_value = LibStdcppVariantNposValue(*index_bytes); + auto index = index_obj->GetValueAsUnsigned(0); + if (index == npos_value) { + stream.Printf(" No Value"); + return true; + } + + auto variant_type = + valobj_sp->GetCompilerType().GetCanonicalType().GetNonReferenceType(); + if (!variant_type) + return false; + if (index >= variant_type.GetNumTemplateArguments(true)) { + stream.Printf(" "); + return true; + } + + auto active_type = variant_type.GetTypeTemplateArgument(index, true); + stream << " Active Type = " << active_type.GetDisplayTypeName() << " "; + return true; +} diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.h b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.h index 8d4d777edee88..8d2025e940ead 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.h @@ -29,6 +29,10 @@ bool LibStdcppUniquePointerSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options); // libstdc++ std::unique_ptr<> +bool LibStdcppVariantSummaryProvider( + ValueObject &valobj, Stream &stream, + const TypeSummaryOptions &options); // libstdc++ std::variant<> + SyntheticChildrenFrontEnd * LibstdcppMapIteratorSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); diff --git a/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h b/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h index fe75bf275f8e2..0f3db4b50eeaf 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h 
+++ b/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h @@ -45,6 +45,26 @@ bool MsvcStlUniquePtrSummaryProvider(ValueObject &valobj, Stream &stream, lldb_private::SyntheticChildrenFrontEnd * MsvcStlUniquePtrSyntheticFrontEndCreator(lldb::ValueObjectSP valobj_sp); +// MSVC STL std::tuple<> +bool IsMsvcStlTuple(ValueObject &valobj); +SyntheticChildrenFrontEnd * +MsvcStlTupleSyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP valobj_sp); + +// MSVC STL std::vector<> +bool IsMsvcStlVector(ValueObject &valobj); +lldb_private::SyntheticChildrenFrontEnd * +MsvcStlVectorSyntheticFrontEndCreator(lldb::ValueObjectSP valobj_sp); + +// MSVC STL std::list and std::forward_list +bool IsMsvcStlList(ValueObject &valobj); +SyntheticChildrenFrontEnd * +MsvcStlForwardListSyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP valobj_sp); +SyntheticChildrenFrontEnd * +MsvcStlListSyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP valobj_sp); + } // namespace formatters } // namespace lldb_private diff --git a/lldb/source/Plugins/Language/CPlusPlus/MsvcStlTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/MsvcStlTuple.cpp new file mode 100644 index 0000000000000..fe20b4c141a65 --- /dev/null +++ b/lldb/source/Plugins/Language/CPlusPlus/MsvcStlTuple.cpp @@ -0,0 +1,105 @@ +//===-- MsvcStlTuple.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MsvcStl.h" +#include "lldb/DataFormatters/FormattersHelpers.h" + +using namespace lldb; +using namespace lldb_private; + +namespace { + +class TupleFrontEnd : public SyntheticChildrenFrontEnd { +public: + TupleFrontEnd(ValueObject &valobj) : SyntheticChildrenFrontEnd(valobj) { + Update(); + } + + llvm::Expected GetIndexOfChildWithName(ConstString name) override { + auto optional_idx = formatters::ExtractIndexFromString(name.GetCString()); + if (!optional_idx) { + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + } + return *optional_idx; + } + + lldb::ChildCacheState Update() override; + llvm::Expected CalculateNumChildren() override { + return m_elements.size(); + } + ValueObjectSP GetChildAtIndex(uint32_t idx) override; + +private: + // The lifetime of a ValueObject and all its derivative ValueObjects + // (children, clones, etc.) is managed by a ClusterManager. These + // objects are only destroyed when every shared pointer to any of them + // is destroyed, so we must not store a shared pointer to any ValueObject + // derived from our backend ValueObject (since we're in the same cluster). 
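// Hence the raw ValueObject pointers stored below: the cluster keeps
// them alive for as long as the backend exists, and GetChildAtIndex()
// only materializes a shared pointer (via GetSP()) when handing a
// child out.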
+ std::vector m_elements; +}; + +} // namespace + +lldb::ChildCacheState TupleFrontEnd::Update() { + m_elements.clear(); + + size_t n_elements = 0; + for (CompilerType ty = m_backend.GetCompilerType(); + ty.GetNumDirectBaseClasses() > 0; + ty = ty.GetDirectBaseClassAtIndex(0, nullptr)) + ++n_elements; + + m_elements.assign(n_elements, nullptr); + return lldb::ChildCacheState::eRefetch; +} + +ValueObjectSP TupleFrontEnd::GetChildAtIndex(uint32_t idx) { + if (idx >= m_elements.size()) + return nullptr; + if (m_elements[idx]) + return m_elements[idx]->GetSP(); + + CompilerType holder_ty = m_backend.GetCompilerType(); + for (uint32_t i = 0; i < idx; i++) { + holder_ty = holder_ty.GetDirectBaseClassAtIndex(0, nullptr); + if (!holder_ty.IsValid()) + return nullptr; + } + + ValueObjectSP holder_sp = m_backend.Cast(holder_ty); + if (!holder_sp) + return nullptr; + holder_sp = holder_sp->GetChildMemberWithName("_Myfirst"); + + if (!holder_sp) + return nullptr; + + ValueObjectSP val_sp = holder_sp->GetChildMemberWithName("_Val"); + if (!val_sp) + return nullptr; + + m_elements[idx] = + val_sp->Clone(ConstString(llvm::formatv("[{0}]", idx).str())).get(); + return m_elements[idx]->GetSP(); +} + +bool formatters::IsMsvcStlTuple(ValueObject &valobj) { + // This returns false for empty tuples, but the libstdc++ formatter handles + // this correctly. + if (auto valobj_sp = valobj.GetNonSyntheticValue()) + return valobj_sp->GetChildMemberWithName("_Myfirst") != nullptr; + return false; +} + +SyntheticChildrenFrontEnd *formatters::MsvcStlTupleSyntheticFrontEndCreator( + CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) { + if (valobj_sp) + return new TupleFrontEnd(*valobj_sp); + return nullptr; +} diff --git a/lldb/source/Plugins/Language/CPlusPlus/MsvcStlVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/MsvcStlVector.cpp new file mode 100644 index 0000000000000..cfc98d27f56d6 --- /dev/null +++ b/lldb/source/Plugins/Language/CPlusPlus/MsvcStlVector.cpp @@ -0,0 +1,305 @@ +//===-- MsvcStlVector.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MsvcStl.h" + +#include "lldb/DataFormatters/FormattersHelpers.h" +#include "lldb/DataFormatters/TypeSynthetic.h" + +using namespace lldb; + +namespace lldb_private { +namespace formatters { + +class MsvcStlVectorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { +public: + MsvcStlVectorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp); + + llvm::Expected CalculateNumChildren() override; + + lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; + + lldb::ChildCacheState Update() override; + + llvm::Expected GetIndexOfChildWithName(ConstString name) override; + +private: + ValueObject *m_start = nullptr; + ValueObject *m_finish = nullptr; + CompilerType m_element_type; + uint32_t m_element_size = 0; +}; + +class MsvcStlVectorBoolSyntheticFrontEnd : public SyntheticChildrenFrontEnd { +public: + MsvcStlVectorBoolSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp); + + llvm::Expected CalculateNumChildren() override; + + lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; + + lldb::ChildCacheState Update() override; + + llvm::Expected GetIndexOfChildWithName(ConstString name) override; + +private: + CompilerType m_bool_type; + ExecutionContextRef m_exe_ctx_ref; + uint64_t m_count = 0; + uint64_t m_element_bit_size = 0; + lldb::addr_t m_base_data_address = 0; + std::map m_children; +}; + +} // namespace formatters +} // namespace lldb_private + +lldb_private::formatters::MsvcStlVectorSyntheticFrontEnd:: + MsvcStlVectorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp) + : SyntheticChildrenFrontEnd(*valobj_sp), m_element_type() { + if (valobj_sp) + Update(); +} + +llvm::Expected lldb_private::formatters:: + MsvcStlVectorSyntheticFrontEnd::CalculateNumChildren() { + if (!m_start || !m_finish) + return llvm::createStringError( + "Failed to determine start/end of vector data."); + + uint64_t start_val = m_start->GetValueAsUnsigned(0); + uint64_t finish_val = m_finish->GetValueAsUnsigned(0); + + // A default-initialized empty vector. 
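// (Layout note, recoverable from the code below: the MSVC STL keeps a
// vector's data pointers at _Mypair._Myval2._Myfirst (begin) and
// _Mylast (end), so the element count is
// (_Mylast - _Myfirst) / element size. A value-initialized vector
// leaves both pointers null, which is the legitimate empty case
// handled first; a null on only one side is treated as an error.
// vector<bool> is laid out differently (a _Myvec of words plus
// _Mysize) and is handled by the separate bool front end further
// down.)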
+ if (start_val == 0 && finish_val == 0) + return 0; + + if (start_val == 0) + return llvm::createStringError("Invalid value for start of vector."); + + if (finish_val == 0) + return llvm::createStringError("Invalid value for end of vector."); + + if (start_val > finish_val) + return llvm::createStringError( + "Start of vector data begins after end pointer."); + + size_t num_children = (finish_val - start_val); + if (num_children % m_element_size) + return llvm::createStringError("Size not multiple of element size."); + + return num_children / m_element_size; +} + +lldb::ValueObjectSP +lldb_private::formatters::MsvcStlVectorSyntheticFrontEnd::GetChildAtIndex( + uint32_t idx) { + if (!m_start || !m_finish) + return lldb::ValueObjectSP(); + + uint64_t offset = idx * m_element_size; + offset = offset + m_start->GetValueAsUnsigned(0); + StreamString name; + name.Printf("[%" PRIu64 "]", (uint64_t)idx); + return CreateValueObjectFromAddress(name.GetString(), offset, + m_backend.GetExecutionContextRef(), + m_element_type); +} + +lldb::ChildCacheState +lldb_private::formatters::MsvcStlVectorSyntheticFrontEnd::Update() { + m_start = m_finish = nullptr; + ValueObjectSP data_sp(m_backend.GetChildAtNamePath({"_Mypair", "_Myval2"})); + + if (!data_sp) + return lldb::ChildCacheState::eRefetch; + + m_start = data_sp->GetChildMemberWithName("_Myfirst").get(); + m_finish = data_sp->GetChildMemberWithName("_Mylast").get(); + if (!m_start || !m_finish) + return lldb::ChildCacheState::eRefetch; + + m_element_type = m_start->GetCompilerType().GetPointeeType(); + llvm::Expected size_or_err = m_element_type.GetByteSize(nullptr); + if (size_or_err) + m_element_size = *size_or_err; + else + LLDB_LOG_ERRORV(GetLog(LLDBLog::DataFormatters), size_or_err.takeError(), + "{0}"); + + return lldb::ChildCacheState::eRefetch; +} + +llvm::Expected lldb_private::formatters:: + MsvcStlVectorSyntheticFrontEnd::GetIndexOfChildWithName(ConstString name) { + if (!m_start || !m_finish) + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + auto optional_idx = ExtractIndexFromString(name.GetCString()); + if (!optional_idx) { + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + } + return *optional_idx; +} + +lldb_private::formatters::MsvcStlVectorBoolSyntheticFrontEnd:: + MsvcStlVectorBoolSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp) + : SyntheticChildrenFrontEnd(*valobj_sp), m_bool_type(), m_exe_ctx_ref(), + m_children() { + if (valobj_sp) { + Update(); + m_bool_type = + valobj_sp->GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeBool); + } +} + +llvm::Expected lldb_private::formatters:: + MsvcStlVectorBoolSyntheticFrontEnd::CalculateNumChildren() { + return m_count; +} + +lldb::ValueObjectSP +lldb_private::formatters::MsvcStlVectorBoolSyntheticFrontEnd::GetChildAtIndex( + uint32_t idx) { + auto iter = m_children.find(idx), end = m_children.end(); + if (iter != end) + return iter->second; + if (idx >= m_count) + return {}; + if (m_base_data_address == 0 || m_count == 0) + return {}; + if (!m_bool_type) + return {}; + + // The vector is represented as a sequence of `int`s. + // The size of an `int` is in `m_element_bit_size` (most often 32b). 
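// (Worked example with 32-bit words: element 70 lives in word
// 70 / 32 == 2, i.e. at byte offset 2 * 4 from the base address, and
// within that word at bit 70 % 32 == 6, exactly as the formula below
// computes.)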
+ // To access the element at index `i`: + // (bool)((data_address[i / bit_size] >> (i % bit_size)) & 1) + + // int *byte_location = &data_address[i / bit_size] + size_t byte_idx = (idx / m_element_bit_size) * (m_element_bit_size / 8); + lldb::addr_t byte_location = m_base_data_address + byte_idx; + + ProcessSP process_sp(m_exe_ctx_ref.GetProcessSP()); + if (!process_sp) + return {}; + Status err; + Scalar scalar; + size_t bytes_read = process_sp->ReadScalarIntegerFromMemory( + byte_location, m_element_bit_size / 8, false, scalar, err); + if (err.Fail() || bytes_read == 0 || !scalar.IsValid()) + return {}; + + size_t bit_index = idx % m_element_bit_size; + bool bit_set = scalar.GetAPSInt()[bit_index]; + std::optional size = + llvm::expectedToOptional(m_bool_type.GetByteSize(nullptr)); + if (!size) + return {}; + WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0)); + if (bit_set && buffer_sp && buffer_sp->GetBytes()) { + // regardless of endianness, anything non-zero is true + *(buffer_sp->GetBytes()) = 1; + } + StreamString name; + name.Printf("[%" PRIu64 "]", (uint64_t)idx); + ValueObjectSP retval_sp(CreateValueObjectFromData( + name.GetString(), + DataExtractor(buffer_sp, process_sp->GetByteOrder(), + process_sp->GetAddressByteSize()), + m_exe_ctx_ref, m_bool_type)); + if (retval_sp) + m_children[idx] = retval_sp; + return retval_sp; +} + +lldb::ChildCacheState +lldb_private::formatters::MsvcStlVectorBoolSyntheticFrontEnd::Update() { + m_exe_ctx_ref.Clear(); + m_count = 0; + m_element_bit_size = 0; + m_base_data_address = 0; + m_children.clear(); + + ValueObjectSP valobj_sp = m_backend.GetSP(); + if (!valobj_sp) + return lldb::ChildCacheState::eRefetch; + auto exe_ctx_ref = valobj_sp->GetExecutionContextRef(); + + ValueObjectSP size_sp = valobj_sp->GetChildMemberWithName("_Mysize"); + if (!size_sp) + return lldb::ChildCacheState::eRefetch; + uint64_t count = size_sp->GetValueAsUnsigned(0); + if (count == 0) + return lldb::ChildCacheState::eReuse; + + ValueObjectSP begin_sp(valobj_sp->GetChildAtNamePath( + {"_Myvec", "_Mypair", "_Myval2", "_Myfirst"})); + if (!begin_sp) + return lldb::ChildCacheState::eRefetch; + + // FIXME: the STL exposes _EEN_VBITS as a constant - it should be used instead + CompilerType begin_ty = begin_sp->GetCompilerType().GetPointeeType(); + if (!begin_ty.IsValid()) + return lldb::ChildCacheState::eRefetch; + llvm::Expected element_bit_size = begin_ty.GetBitSize(nullptr); + if (!element_bit_size) + return lldb::ChildCacheState::eRefetch; + + uint64_t base_data_address = begin_sp->GetValueAsUnsigned(0); + if (!base_data_address) + return lldb::ChildCacheState::eRefetch; + + m_exe_ctx_ref = exe_ctx_ref; + m_count = count; + m_element_bit_size = *element_bit_size; + m_base_data_address = base_data_address; + return lldb::ChildCacheState::eRefetch; +} + +llvm::Expected +lldb_private::formatters::MsvcStlVectorBoolSyntheticFrontEnd:: + GetIndexOfChildWithName(ConstString name) { + if (!m_count || !m_base_data_address) + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + auto optional_idx = ExtractIndexFromString(name.AsCString()); + if (!optional_idx) { + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + } + uint32_t idx = *optional_idx; + if (idx >= CalculateNumChildrenIgnoringErrors()) + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + return idx; +} + +lldb_private::SyntheticChildrenFrontEnd * 
+lldb_private::formatters::MsvcStlVectorSyntheticFrontEndCreator( + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + valobj_sp = valobj_sp->GetNonSyntheticValue(); + if (!valobj_sp) + return nullptr; + + // We can't check the template parameter here, because PDB doesn't include + // this information. + + // vector + if (valobj_sp->GetChildMemberWithName("_Mypair") != nullptr) + return new MsvcStlVectorSyntheticFrontEnd(valobj_sp); + // vector + if (valobj_sp->GetChildMemberWithName("_Myvec") != nullptr) + return new MsvcStlVectorBoolSyntheticFrontEnd(valobj_sp); + + return nullptr; +} diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp index ef691b77193ce..58ebb7be11994 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp @@ -108,13 +108,21 @@ MinidumpParser::GetThreadContext(const minidump::Thread &td) { llvm::ArrayRef MinidumpParser::GetThreadContextWow64(const minidump::Thread &td) { + Log *log = GetLog(LLDBLog::Process); // On Windows, a 32-bit process can run on a 64-bit machine under WOW64. If // the minidump was captured with a 64-bit debugger, then the CONTEXT we just // grabbed from the mini_dump_thread is the one for the 64-bit "native" // process rather than the 32-bit "guest" process we care about. In this // case, we can get the 32-bit CONTEXT from the TEB (Thread Environment // Block) of the 64-bit process. - auto teb_mem = GetMemory(td.EnvironmentBlock, sizeof(TEB64)); + auto teb_mem_maybe = GetMemory(td.EnvironmentBlock, sizeof(TEB64)); + if (!teb_mem_maybe) { + LLDB_LOG_ERROR(log, teb_mem_maybe.takeError(), + "Failed to read Thread Environment Block: {0}"); + return {}; + } + + auto teb_mem = *teb_mem_maybe; if (teb_mem.empty()) return {}; @@ -126,8 +134,16 @@ MinidumpParser::GetThreadContextWow64(const minidump::Thread &td) { // Slot 1 of the thread-local storage in the 64-bit TEB points to a structure // that includes the 32-bit CONTEXT (after a ULONG). See: // https://msdn.microsoft.com/en-us/library/ms681670.aspx - auto context = + auto context_maybe = GetMemory(wow64teb->tls_slots[1] + 4, sizeof(MinidumpContext_x86_32)); + if (!context_maybe) { + LLDB_LOG_ERROR(log, context_maybe.takeError(), + "Failed to read WOW Thread Context: {0}"); + return {}; + } + + auto context = *context_maybe; + if (context.size() < sizeof(MinidumpContext_x86_32)) return {}; @@ -478,11 +494,13 @@ void MinidumpParser::PopulateMemoryRanges() { m_memory_ranges.Sort(); } -llvm::ArrayRef MinidumpParser::GetMemory(lldb::addr_t addr, - size_t size) { +llvm::Expected> +MinidumpParser::GetMemory(lldb::addr_t addr, size_t size) { std::optional range = FindMemoryRange(addr); if (!range) - return {}; + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "No memory range found for address (0x%" PRIx64 ")", addr); // There's at least some overlap between the beginning of the desired range // (addr) and the current range. 
Figure out where the overlap begins and @@ -491,7 +509,11 @@ llvm::ArrayRef MinidumpParser::GetMemory(lldb::addr_t addr, const size_t offset = addr - range->start; if (addr < range->start || offset >= range->range_ref.size()) - return {}; + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "Address (0x%" PRIx64 ") is not in range [0x%" PRIx64 " - 0x%" PRIx64 + ")", + addr, range->start, range->start + range->range_ref.size()); const size_t overlap = std::min(size, range->range_ref.size() - offset); return range->range_ref.slice(offset, overlap); diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h index 14599f8d572aa..3b7d33daca717 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h @@ -104,7 +104,8 @@ class MinidumpParser { std::optional FindMemoryRange(lldb::addr_t addr); - llvm::ArrayRef GetMemory(lldb::addr_t addr, size_t size); + llvm::Expected> GetMemory(lldb::addr_t addr, + size_t size); /// Returns a list of memory regions and a flag indicating whether the list is /// complete (includes all regions mapped into the process memory). diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index ef3c00e2857df..17a421a722743 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -322,12 +322,15 @@ size_t ProcessMinidump::ReadMemory(lldb::addr_t addr, void *buf, size_t size, size_t ProcessMinidump::DoReadMemory(lldb::addr_t addr, void *buf, size_t size, Status &error) { - llvm::ArrayRef mem = m_minidump_parser->GetMemory(addr, size); - if (mem.empty()) { - error = Status::FromErrorString("could not parse memory info"); + llvm::Expected> mem_maybe = + m_minidump_parser->GetMemory(addr, size); + if (!mem_maybe) { + error = Status::FromError(mem_maybe.takeError()); return 0; } + llvm::ArrayRef mem = *mem_maybe; + std::memcpy(buf, mem.data(), mem.size()); return mem.size(); } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index bafe9d56a93bf..e847ede1a4ba6 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -474,20 +474,6 @@ static void ParseLangArgs(LangOptions &Opts, ArchSpec arch) { // specified, or -std is set to a conforming mode. Opts.Trigraphs = !Opts.GNUMode; Opts.CharIsSigned = arch.CharIsSignedByDefault(); - Opts.OptimizeSize = 0; - - // FIXME: Eliminate this dependency. - // unsigned Opt = - // Args.hasArg(OPT_Os) ? 2 : getLastArgIntValue(Args, OPT_O, 0, Diags); - // Opts.Optimize = Opt != 0; - unsigned Opt = 0; - - // This is the __NO_INLINE__ define, which just depends on things like the - // optimization level and -fno-inline, not actually whether the backend has - // inlining enabled. - // - // FIXME: This is affected by other options (-fno-inline). - Opts.NoInlineDefine = !Opt; // This is needed to allocate the extra space for the owning module // on each decl. 
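For context, the minidump changes above move MinidumpParser::GetMemory (and its callers, such as ProcessMinidump::DoReadMemory) from "empty ArrayRef means failure" to llvm::Expected, which forces every caller to either consume the value or take the error. A minimal sketch of that producer/consumer pattern, using only generic LLVM Support APIs (the parse() helper and its inputs are invented for illustration, not part of this patch):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"

// Producer: return either a value or a descriptive error, never a
// sentinel like 0 or an empty range.
static llvm::Expected<int> parse(llvm::StringRef s) {
  int v = 0;
  if (s.getAsInteger(10, v)) // returns true on failure
    return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                   "not an integer: %s", s.str().c_str());
  return v;
}

// Consumer: an Expected must be checked before use; destroying an
// unchecked error aborts in assertion-enabled builds.
static int parseOrZero(llvm::StringRef s) {
  llvm::Expected<int> v = parse(s);
  if (!v) {
    llvm::consumeError(v.takeError()); // or log / convert, as above
    return 0;
  }
  return *v;
}

DoReadMemory above follows the same shape, converting the error into a Status with Status::FromError(mem_maybe.takeError()) rather than silently dropping it.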
diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td index 4aa9e046d6077..e6cd48a9d3dad 100644 --- a/lldb/source/Target/TargetProperties.td +++ b/lldb/source/Target/TargetProperties.td @@ -5,8 +5,8 @@ let Definition = "target_experimental" in { Global, DefaultTrue, Desc<"If true, inject local variables explicitly into the expression text. This will fix symbol resolution when there are name collisions between ivars and local variables. But it can make expressions run much more slowly.">; def UseDIL : Property<"use-DIL", "Boolean">, - Global, DefaultFalse, - Desc<"If true, use the alternative DIL implementation for frame variable evaluation.">; + Global, DefaultTrue, + Desc<"If true, use the DIL implementation for frame variable evaluation.">; } let Definition = "target" in { @@ -99,7 +99,7 @@ let Definition = "target" in { DefaultUnsignedValue<24>, Desc<"Maximum number of children to expand in any level of depth.">; def MaxChildrenDepth: Property<"max-children-depth", "UInt64">, - DefaultUnsignedValue<0xFFFFFFFF>, + DefaultUnsignedValue<4>, Desc<"Maximum depth to expand children.">; def MaxSummaryLength: Property<"max-string-summary-length", "UInt64">, DefaultUnsignedValue<1024>, diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 70b9800f4dade..7c71aaae6bcf2 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -228,9 +228,9 @@ static const CoreDefinition g_core_definitions[] = { {eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon, ArchSpec::eCore_hexagon_hexagonv5, "hexagonv5"}, - {eByteOrderLittle, 4, 2, 4, llvm::Triple::riscv32, ArchSpec::eCore_riscv32, + {eByteOrderLittle, 4, 2, 8, llvm::Triple::riscv32, ArchSpec::eCore_riscv32, "riscv32"}, - {eByteOrderLittle, 8, 2, 4, llvm::Triple::riscv64, ArchSpec::eCore_riscv64, + {eByteOrderLittle, 8, 2, 8, llvm::Triple::riscv64, ArchSpec::eCore_riscv64, "riscv64"}, {eByteOrderLittle, 4, 4, 4, llvm::Triple::loongarch32, diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp index fd3f9f8724608..6f28434c646cd 100644 --- a/lldb/source/ValueObject/DILEval.cpp +++ b/lldb/source/ValueObject/DILEval.cpp @@ -303,7 +303,7 @@ Interpreter::Visit(const MemberOfNode *node) { } } - if (field_obj && field_obj->GetName() == node->GetFieldName()) { + if (field_obj) { if (m_use_dynamic != lldb::eNoDynamicValues) { lldb::ValueObjectSP dynamic_val_sp = field_obj->GetDynamicValue(m_use_dynamic); diff --git a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py index c0ef29fab8597..0f56057189395 100644 --- a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py +++ b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py @@ -98,7 +98,6 @@ def test_subscript(self): substrs=["subscript of pointer to incomplete type 'void'"], ) - @expectedFailureAll(oslist=["windows"]) def test_subscript_synthetic(self): self.build() lldbutil.run_to_source_breakpoint( diff --git a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py index b2ce9602e6a50..6f00b9e0cfb6c 100644 --- a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py +++ 
b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py @@ -18,6 +18,7 @@ class TestFrameVarDILQualifiedId(TestBase): # each debug info format. NO_DEBUG_INFO_TESTCASE = True + @skipIfWindows def test_frame_var(self): self.build() lldbutil.run_to_source_breakpoint( @@ -29,3 +30,17 @@ def test_frame_var(self): self.expect_var_path("ns::i", value="1") self.expect_var_path("::ns::ns::i", value="2") self.expect_var_path("ns::ns::i", value="2") + + self.expect_var_path("foo", value="1") + self.expect_var_path("::(anonymous namespace)::foo", value="13") + self.expect_var_path("(anonymous namespace)::foo", value="13") + self.expect_var_path("ns1::(anonymous namespace)::foo", value="5") + self.expect_var_path( + "(anonymous namespace)::ns2::(anonymous namespace)::foo", + value="7", + ) + self.expect_var_path("::ns1::(anonymous namespace)::foo", value="5") + self.expect_var_path( + "::(anonymous namespace)::ns2::(anonymous namespace)::foo", + value="7", + ) diff --git a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp index 8a5c47a6f364c..10ffa1e54a991 100644 --- a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp +++ b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp @@ -10,7 +10,26 @@ int i = 2; } // namespace ns +namespace { +int foo = 13; +} + +namespace ns1 { +namespace { +int foo = 5; +} +} // namespace ns1 + +namespace { +namespace ns2 { +namespace { +int foo = 7; +} +} // namespace ns2 +} // namespace + int main(int argc, char **argv) { + int foo = 1; - return 0; // Set a breakpoint here + return foo + ::foo + ns1::foo + ns2::foo; // Set a breakpoint here } diff --git a/lldb/test/API/commands/plugin/TestPlugin.py b/lldb/test/API/commands/plugin/TestPlugin.py index fdfb14bfcc24e..bd8ab9604538e 100644 --- a/lldb/test/API/commands/plugin/TestPlugin.py +++ b/lldb/test/API/commands/plugin/TestPlugin.py @@ -60,3 +60,49 @@ def do_list_disable_enable_test(self, plugin_namespace): self.expect( f"plugin enable {plugin_namespace}", substrs=[plugin_namespace, "[+]"] ) + + def test_completions(self): + # Make sure completions work for the plugin list, enable, and disable commands. + # We just check a few of the expected plugins to make sure the completion works. + self.completions_contain( + "plugin list ", ["abi", "architecture", "disassembler"] + ) + self.completions_contain( + "plugin enable ", ["abi", "architecture", "disassembler"] + ) + self.completions_contain( + "plugin disable ", ["abi", "architecture", "disassembler"] + ) + + # A completion for a partial namespace should be the full namespace. + # This allows the user to run the command on the full namespace. + self.completions_match("plugin list ab", ["abi"]) + self.completions_contain( + "plugin list object", ["object-container", "object-file"] + ) + + # A completion for a full namespace should contain the plugins in that namespace. + self.completions_contain("plugin list object-file", ["object-file.JSON"]) + self.completions_contain("plugin list object-file.", ["object-file.JSON"]) + self.completions_contain("plugin list object-file.J", ["object-file.JSON"]) + self.completions_contain("plugin list object-file.JS", ["object-file.JSON"]) + + # Check for a completion that is both a complete namespace and a prefix of + # another namespace. It should return the completions for the plugins in the completed + # namespace as well as the completion for the partial namespace.
+ self.completions_contain( + "plugin list language", ["language.cplusplus", "language-runtime"] + ) + + # When the namespace is a prefix of another namespace and the user types a dot, the + # completion should not include the match for the partial namespace. + self.completions_contain( + "plugin list language.", ["language.cplusplus"], match=True + ) + self.completions_contain( + "plugin list language.", ["language-runtime"], match=False + ) + + # Check for an empty completion list when the name is invalid. + # See docs for `complete_from_to` for how this checks for an empty list. + self.complete_from_to("plugin list abi.foo", ["plugin list abi.foo"]) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-advanced/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-advanced/main.cpp index 9d12ca30f984c..32f5f2ce436fe 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-advanced/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-advanced/main.cpp @@ -165,6 +165,7 @@ int main (int argc, const char * argv[]) Simple a_simple_object(3,0.14,'E'); VeryLong a_long_guy; + auto *unused = &a_long_guy; // ensure a_long_guy isn't optimized out std::string some_string = "012345678901234567890123456789" "012345678901234567890123456789" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-disabling/TestDataFormatterDisabling.py b/lldb/test/API/functionalities/data-formatter/data-formatter-disabling/TestDataFormatterDisabling.py index 20f49e02adcea..e6aeef2bedff2 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-disabling/TestDataFormatterDisabling.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-disabling/TestDataFormatterDisabling.py @@ -16,10 +16,6 @@ def setUp(self): # Find the line number to break at.
self.line = line_number("main.cpp", "// Set break point at this line.") - @expectedFailureAll( - oslist=["windows"], - bugnumber="llvm.org/pr24462, Data formatters have problems on Windows", - ) def test_with_run_command(self): """Check that we can properly disable all data formatter categories.""" self.build() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py index f63f8fe1d6a62..45695c43b42a9 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py @@ -7,9 +7,6 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -USE_LIBSTDCPP = "USE_LIBSTDCPP" -USE_LIBCPP = "USE_LIBCPP" - class TestDataFormatterGenericForwardList(TestBase): def setUp(self): @@ -17,9 +14,8 @@ def setUp(self): self.line = line_number("main.cpp", "// break here") self.namespace = "std" - def do_test(self, stdlib_type): + def do_test(self): """Test that std::forward_list is displayed correctly""" - self.build(dictionary={stdlib_type: "1"}) lldbutil.run_to_source_breakpoint( self, "// break here", lldb.SBFileSpec("main.cpp", False) ) @@ -76,10 +72,8 @@ def do_test(self, stdlib_type): substrs=["size=24", "[0]", "[1]", "[2]", "..."], ) - def do_test_ptr_and_ref(self, stdlib_type): + def do_test_ptr_and_ref(self): """Test that ref and ptr to std::forward_list is displayed correctly""" - self.build(dictionary={stdlib_type: "1"}) - (_, process, _, bkpt) = lldbutil.run_to_source_breakpoint( self, "Check ref and ptr", lldb.SBFileSpec("main.cpp", False) ) @@ -158,16 +152,31 @@ def do_test_ptr_and_ref(self, stdlib_type): @add_test_categories(["libstdcxx"]) def test_libstdcpp(self): - self.do_test(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test() @add_test_categories(["libstdcxx"]) def test_ptr_and_ref_libstdcpp(self): - self.do_test_ptr_and_ref(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test_ptr_and_ref() @add_test_categories(["libc++"]) def test_libcpp(self): - self.do_test(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test() @add_test_categories(["libc++"]) def test_ptr_and_ref_libcpp(self): - self.do_test_ptr_and_ref(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test_ptr_and_ref() + + @add_test_categories(["msvcstl"]) + def test_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. 
+ self.build() + self.do_test() + + @add_test_categories(["msvcstl"]) + def test_ptr_and_ref_msvcstl(self): + self.build() + self.do_test_ptr_and_ref() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py index 78c93b1e3caea..c0207e6ab5911 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py @@ -8,9 +8,6 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -USE_LIBSTDCPP = "USE_LIBSTDCPP" -USE_LIBCPP = "USE_LIBCPP" - class GenericListDataFormatterTestCase(TestBase): def setUp(self): @@ -25,9 +22,8 @@ def setUp(self): "main.cpp", "// Set final break point at this line." ) - def do_test_with_run_command(self, stdlib_type): + def do_test_with_run_command(self, *, is_libstdcpp=False): """Test that that file and class static variables display correctly.""" - self.build(dictionary={stdlib_type: "1"}) self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) lldbutil.run_break_set_by_file_and_line( @@ -62,7 +58,7 @@ def cleanup(): "frame variable numbers_list --raw", matching=False, substrs=["size=0"] ) - if stdlib_type == USE_LIBSTDCPP: + if is_libstdcpp: self.expect( "frame variable &numbers_list._M_impl._M_node --raw", matching=False, @@ -230,10 +226,8 @@ def cleanup(): "text_list.MightHaveChildren() says False for non empty!", ) - def do_test_ptr_and_ref(self, stdlib_type): + def do_test_ptr_and_ref(self): """Test that ref and ptr to std::list is displayed correctly""" - self.build(dictionary={stdlib_type: "1"}) - (_, process, _, bkpt) = lldbutil.run_to_source_breakpoint( self, "Check ref and ptr", lldb.SBFileSpec("main.cpp", False) ) @@ -302,16 +296,31 @@ def do_test_ptr_and_ref(self, stdlib_type): @add_test_categories(["libstdcxx"]) def test_with_run_command_libstdcpp(self): - self.do_test_with_run_command(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test_with_run_command(is_libstdcpp=True) @add_test_categories(["libstdcxx"]) def test_ptr_and_ref_libstdcpp(self): - self.do_test_ptr_and_ref(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test_ptr_and_ref() @add_test_categories(["libc++"]) def test_with_run_command_libcpp(self): - self.do_test_with_run_command(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test_with_run_command() @add_test_categories(["libc++"]) def test_ptr_and_ref_libcpp(self): - self.do_test_ptr_and_ref(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test_ptr_and_ref() + + @add_test_categories(["msvcstl"]) + def test_with_run_command_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. 
+ self.build() + self.do_test_with_run_command() + + @add_test_categories(["msvcstl"]) + def test_ptr_and_ref_msvcstl(self): + self.build() + self.do_test_ptr_and_ref() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py index 039c703491759..f6174dd786380 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py @@ -9,15 +9,11 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -USE_LIBSTDCPP = "USE_LIBSTDCPP" -USE_LIBCPP = "USE_LIBCPP" - class GenericListDataFormatterTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True - def do_test_with_run_command(self, stdlib_type): - self.build(dictionary={stdlib_type: "1"}) + def do_test_with_run_command(self): exe = self.getBuildArtifact("a.out") target = self.dbg.CreateTarget(exe) self.assertTrue(target and target.IsValid(), "Target is valid") @@ -64,8 +60,16 @@ def do_test_with_run_command(self, stdlib_type): @add_test_categories(["libstdcxx"]) def test_with_run_command_libstdcpp(self): - self.do_test_with_run_command(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test_with_run_command() @add_test_categories(["libc++"]) def test_with_run_command_libcpp(self): - self.do_test_with_run_command(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test_with_run_command() + + @add_test_categories(["msvcstl"]) + def test_with_run_command_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. + self.build() + self.do_test_with_run_command() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/main.cpp index e797b3d04dd6b..b31d4ca909ecb 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/main.cpp @@ -1,8 +1,3 @@ -// Evil hack: To simulate memory corruption, we want to fiddle with some internals of std::list. -// Make those accessible to us. -#define private public -#define protected public - #include #include #include diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py index ade502e12b928..b23d549fe4c18 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/TestDataFormatterStdTuple.py @@ -74,3 +74,9 @@ def test_libcxx(self): def test_libstdcxx(self): self.build(dictionary={"USE_LIBSTDCPP": 1}) self.do_test() + + @add_test_categories(["msvcstl"]) + def test_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. 
+        self.build()
+        self.do_test()
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/main.cpp
index d49dbe8a5f1af..53d0a4efd3865 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/tuple/main.cpp
@@ -6,5 +6,6 @@ int main() {
   std::tuple one_elt{47};
   std::tuple string_elt{"foobar"};
   std::tuple three_elts{1, 47l, "foo"};
+  auto *foo = &empty; // needed with MSVC STL to keep the variable
   return 0; // break here
 }
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py
index 56c86d1edde25..dd142d2be193b 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py
@@ -47,7 +47,7 @@ def cleanup():
         self.expect(
             "frame variable -A vBool",
             substrs=[
-                "size=49",
+                "size=73",
                 "[0] = false",
                 "[1] = true",
                 "[18] = false",
@@ -55,13 +55,20 @@ def cleanup():
                 "[36] = false",
                 "[47] = true",
                 "[48] = true",
+                "[49] = true",
+                "[50] = false",
+                "[56] = false",
+                "[65] = true",
+                "[70] = false",
+                "[71] = true",
+                "[72] = true",
             ],
         )
 
         self.expect(
             "expr -A -- vBool",
             substrs=[
-                "size=49",
+                "size=73",
                 "[0] = false",
                 "[1] = true",
                 "[18] = false",
@@ -69,6 +76,13 @@
                 "[36] = false",
                 "[47] = true",
                 "[48] = true",
+                "[49] = true",
+                "[50] = false",
+                "[56] = false",
+                "[65] = true",
+                "[70] = false",
+                "[71] = true",
+                "[72] = true",
             ],
         )
 
@@ -88,3 +102,9 @@ def test_libstdcxx_debug(self):
             dictionary={"USE_LIBSTDCPP": 1, "CXXFLAGS_EXTRAS": "-D_GLIBCXX_DEBUG"}
         )
         self.do_test()
+
+    @add_test_categories(["msvcstl"])
+    def test_msvcstl(self):
+        # No flags, because the "msvcstl" category checks that the MSVC STL is used by default.
+ self.build() + self.do_test() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/main.cpp index 22fc6c89ca8a2..2c54166ace7cc 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/main.cpp @@ -1,10 +1,10 @@ #include -#include #include int main() { std::vector vBool; + // 0..=7 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -14,6 +14,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 8..=15 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -23,6 +24,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 16..=23 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -32,6 +34,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 24..=31 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -41,6 +44,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 32..=39 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -50,6 +54,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 40..=47 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -58,6 +63,38 @@ int main() { vBool.push_back(true); vBool.push_back(false); vBool.push_back(true); + + // 48..=55 + vBool.push_back(true); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + + // 56..=63 + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + + // 64..=71 + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(true); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + + // 72 vBool.push_back(true); std::puts("// Set break point at this line."); diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vector/TestDataFormatterStdVector.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vector/TestDataFormatterStdVector.py index ba8b10450f4fc..d4da60f86a315 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vector/TestDataFormatterStdVector.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vector/TestDataFormatterStdVector.py @@ -184,6 +184,12 @@ def test_libcxx(self): self.build(dictionary={"USE_LIBCPP": 1}) self.do_test() + @add_test_categories(["msvcstl"]) + def test_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. 
+ self.build() + self.do_test() + def do_test_ref_and_ptr(self): """Test that that file and class static variables display correctly.""" (self.target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( @@ -215,3 +221,8 @@ def test_ref_and_ptr_libstdcxx_debug(self): def test_ref_and_ptr_libcxx(self): self.build(dictionary={"USE_LIBCPP": 1}) self.do_test_ref_and_ptr() + + @add_test_categories(["msvcstl"]) + def test_ref_and_ptr_msvcstl(self): + self.build() + self.do_test_ref_and_ptr() diff --git a/lldb/test/API/lang/cpp/template/TestTemplateArgs.py b/lldb/test/API/lang/cpp/template/TestTemplateArgs.py index 9708e98a59fce..2fd05d6404650 100644 --- a/lldb/test/API/lang/cpp/template/TestTemplateArgs.py +++ b/lldb/test/API/lang/cpp/template/TestTemplateArgs.py @@ -82,7 +82,6 @@ def test_integer_args(self): 'expr_result.GetType().GetName() == "int"', ) - @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24489") def test_template_template_args(self): frame = self.prepareProcess() diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 83713213ce1fe..1e0c7d5e1792a 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -130,14 +130,14 @@ def delete_module_cache(path): config.environment["MallocNanoZone"] = "0" if "Address" in config.llvm_use_sanitizer: config.environment["ASAN_OPTIONS"] = "detect_stack_use_after_return=1" - if "Darwin" in config.host_os: + if "Darwin" in config.target_os: config.environment["DYLD_INSERT_LIBRARIES"] = find_sanitizer_runtime( "libclang_rt.asan_osx_dynamic.dylib" ) if "Thread" in config.llvm_use_sanitizer: config.environment["TSAN_OPTIONS"] = "halt_on_error=1" - if "Darwin" in config.host_os: + if "Darwin" in config.target_os: config.environment["DYLD_INSERT_LIBRARIES"] = find_sanitizer_runtime( "libclang_rt.tsan_osx_dynamic.dylib" ) @@ -271,9 +271,15 @@ def delete_module_cache(path): if is_configured("lldb_framework_dir"): dotest_cmd += ["--framework", config.lldb_framework_dir] + if is_configured("cmake_build_type"): dotest_cmd += ["--cmake-build-type", config.cmake_build_type] +# Facebook T92898286 +if is_configured("llvm_test_bolt"): + dotest_cmd += ["-E", '"--post-link-optimize"'] +# End Facebook T92898286 + if "lldb-simulator-ios" in config.available_features: dotest_cmd += ["--apple-sdk", "iphonesimulator", "--platform-name", "ios-simulator"] elif "lldb-simulator-watchos" in config.available_features: diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 86d58889cc4ad..20a88044bf74a 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -1,5 +1,9 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -13,7 +17,7 @@ config.lldb_src_root = "@LLDB_SOURCE_DIR@" config.lldb_libs_dir = lit_config.substitute("@LLDB_LIBS_DIR@") config.lldb_framework_dir = lit_config.substitute("@LLDB_FRAMEWORK_DIR@") config.cmake_cxx_compiler = "@CMAKE_CXX_COMPILER@" -config.host_os = "@HOST_OS@" +config.target_os = "@HOST_OS@" config.host_triple = "@LLVM_HOST_TRIPLE@" config.shared_libs = @LLVM_ENABLE_SHARED_LIBS@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" @@ -46,6 +50,10 @@ config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api") config.clang_module_cache = 
os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api")
 
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
 # Plugins
 lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@'
 if lldb_build_intel_pt == '1':
diff --git a/lldb/test/API/python_api/type/main.cpp b/lldb/test/API/python_api/type/main.cpp
index 6acde5bb666a6..449f77db0d75e 100644
--- a/lldb/test/API/python_api/type/main.cpp
+++ b/lldb/test/API/python_api/type/main.cpp
@@ -44,7 +44,12 @@ template struct PointerInfo {
 };
 
 template >
-struct Pointer {};
+struct Pointer {
+  // When compiling for Windows with exceptions enabled, this struct
+  // must contain something that takes space and is initialised.
+  // Otherwise it will not be present in the debug information.
+  int pad = 0;
+};
 
 enum EnumType {};
 enum class ScopedEnumType {};
diff --git a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
index 0d2774b281710..20a75f4076e42 100644
--- a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
+++ b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
@@ -131,7 +131,7 @@ def run_test_evaluate_expressions(
         self.assertEvaluateFailure("a_function(1)")
         self.assertEvaluateFailure("var2 + struct1.foo")
         self.assertEvaluateFailure("foo_func")
-        self.assertEvaluateFailure("foo_var")
+        self.assertEvaluate("foo_var", "44")
 
         # Expressions at breakpoint 2, which is an anonymous block
         self.continue_to_breakpoint(breakpoint_2)
@@ -169,7 +169,7 @@ def run_test_evaluate_expressions(
         self.assertEvaluateFailure("a_function(1)")
         self.assertEvaluateFailure("var2 + struct1.foo")
         self.assertEvaluateFailure("foo_func")
-        self.assertEvaluateFailure("foo_var")
+        self.assertEvaluate("foo_var", "44")
 
         # Expressions at breakpoint 3, which is inside a_function
         self.continue_to_breakpoint(breakpoint_3)
@@ -195,7 +195,7 @@ def run_test_evaluate_expressions(
         self.assertEvaluateFailure("a_function(1)")
         self.assertEvaluateFailure("list + 1")
         self.assertEvaluateFailure("foo_func")
-        self.assertEvaluateFailure("foo_var")
+        self.assertEvaluate("foo_var", "44")
 
         # Now we check that values are updated after stepping
         self.continue_to_breakpoint(breakpoint_4)
diff --git a/lldb/test/Shell/Commands/Inputs/dis_filt.py b/lldb/test/Shell/Commands/Inputs/dis_filt.py
new file mode 100755
index 0000000000000..bac5a36be2f3c
--- /dev/null
+++ b/lldb/test/Shell/Commands/Inputs/dis_filt.py
@@ -0,0 +1,8 @@
+#! /usr/bin/env python3
+
+import sys
+
+for line in sys.stdin:
+    if "0940003f 00200020" in line and "<unknown>" in line:
+        line = line.replace("<unknown>", "Fake64")
+    print(line, end="")
diff --git a/lldb/test/Shell/Commands/command-disassemble-riscv32-bytes.s b/lldb/test/Shell/Commands/command-disassemble-riscv32-bytes.s
new file mode 100644
index 0000000000000..78be614e3af15
--- /dev/null
+++ b/lldb/test/Shell/Commands/command-disassemble-riscv32-bytes.s
@@ -0,0 +1,37 @@
+# REQUIRES: riscv
+# REQUIRES: python
+
+# This test verifies that disassemble -b prints out the correct bytes and
+# format for standard and unknown riscv instructions of various sizes,
+# and that unknown instructions show opcodes and disassemble as "<unknown>".
+# It also tests that the fdis command from examples/python/filter_disasm.py
+# pipes the disassembly output through a simple filter program correctly.
+
+# RUN: llvm-mc -filetype=obj -mattr=+c --triple=riscv32-unknown-unknown %s -o %t
+# RUN: %lldb -b %t "-o" "disassemble -b -n main" | FileCheck %s
+# RUN: %lldb -b %t -o "command script import %S/../../../examples/python/filter_disasm.py" -o "fdis set %python %S/Inputs/dis_filt.py" -o "fdis -n main" | FileCheck --check-prefix=FILTER %s
+
+main:
+    addi sp, sp, -0x20                 # 16 bit standard instruction
+    sw a0, -0xc(s0)                    # 32 bit standard instruction
+    .insn 8, 0x2000200940003F;         # 64 bit custom instruction
+    .insn 6, 0x021F | 0x00001000 << 32 # 48 bit xqci.e.li rd=8 imm=0x1000
+    .insn 4, 0x84F940B                 # 32 bit xqci.insbi
+    .insn 2, 0xB8F2                    # 16 bit cm.push
+
+# CHECK: [0x0] <+0>: 1101 addi sp, sp, -0x20
+# CHECK-NEXT: [0x2] <+2>: fea42a23 sw a0, -0xc(s0)
+# CHECK-NEXT: [0x6] <+6>: 0940003f 00200020 <unknown>
+# CHECK-NEXT: [0xe] <+14>: 021f 0000 1000 <unknown>
+# CHECK-NEXT: [0x14] <+20>: 084f940b <unknown>
+# CHECK-NEXT: [0x18] <+24>: b8f2 <unknown>
+
+# FILTER: Disassembly filter command (fdis) loaded
+# FILTER: [0x0] <+0>: 1101 addi sp, sp, -0x20
+# FILTER-NEXT: [0x2] <+2>: fea42a23 sw a0, -0xc(s0)
+# FILTER-NEXT: [0x6] <+6>: 0940003f 00200020 Fake64
+# FILTER-NEXT: [0xe] <+14>: 021f 0000 1000 <unknown>
+# FILTER-NEXT: [0x14] <+20>: 084f940b <unknown>
+# FILTER-NEXT: [0x18] <+24>: b8f2 <unknown>
+
diff --git a/lldb/test/Shell/Commands/command-disassemble-x86-bytes.s b/lldb/test/Shell/Commands/command-disassemble-x86-bytes.s
new file mode 100644
index 0000000000000..fae08d09a0832
--- /dev/null
+++ b/lldb/test/Shell/Commands/command-disassemble-x86-bytes.s
@@ -0,0 +1,28 @@
+# REQUIRES: x86
+
+# This test verifies that disassemble -b prints out the correct bytes and
+# format for x86_64 instructions of various sizes, and that an unknown
+# instruction shows the opcode and disassembles as "<unknown>"
+
+# RUN: llvm-mc -filetype=obj --triple=x86_64-unknown-unknown %s -o %t
+# RUN: %lldb -b %t -o "disassemble -b -n main" | FileCheck %s
+
+main:                                 # @main
+    subq  $0x18, %rsp
+    movl  $0x0, 0x14(%rsp)
+    movq  %rdx, 0x8(%rsp)
+    movl  %ecx, 0x4(%rsp)
+    movl  (%rsp), %eax
+    addq  $0x18, %rsp
+    retq
+    .byte 0x6
+
+# CHECK: [0x0] <+0>: 48 83 ec 18 subq $0x18, %rsp
+# CHECK-NEXT: [0x4] <+4>: c7 44 24 14 00 00 00 00 movl $0x0, 0x14(%rsp)
+# CHECK-NEXT: [0xc] <+12>: 48 89 54 24 08 movq %rdx, 0x8(%rsp)
+# CHECK-NEXT: [0x11] <+17>: 89 4c 24 04 movl %ecx, 0x4(%rsp)
+# CHECK-NEXT: [0x15] <+21>: 8b 04 24 movl (%rsp), %eax
+# CHECK-NEXT: [0x18] <+24>: 48 83 c4 18 addq $0x18, %rsp
+# CHECK-NEXT: [0x1c] <+28>: c3 retq
+# CHECK-NEXT: [0x1d] <+29>: 06 <unknown>
+
diff --git a/lldb/test/Shell/Minidump/missing-memory-region.yaml b/lldb/test/Shell/Minidump/missing-memory-region.yaml
new file mode 100644
index 0000000000000..1784cacfaf1ba
--- /dev/null
+++ b/lldb/test/Shell/Minidump/missing-memory-region.yaml
@@ -0,0 +1,42 @@
+# Check that looking up a memory region not present in the Minidump fails
+# even if it's in the /proc/<pid>/maps file.
+ +# RUN: yaml2obj %s -o %t +# RUN: %lldb -c %t -o "memory read 0x5000" 2>&1 | FileCheck %s + +# CHECK-LABEL: (lldb) memory read 0x5000 +# CHECK-NEXT: error: No memory range found for address (0x5000) + +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: AMD64 + Processor Level: 6 + Processor Revision: 15876 + Number of Processors: 40 + Platform ID: Linux + CSD Version: 'Linux 3.13.0-91-generic #138-Ubuntu SMP Fri Jun 24 17:00:34 UTC 2016 x86_64' + CPU: + Vendor ID: GenuineIntel + Version Info: 0x00000000 + Feature Info: 0x00000000 + - Type: LinuxProcStatus + Text: | + Name: test-yaml + Umask: 0002 + State: t (tracing stop) + Pid: 8567 + - Type: LinuxMaps + Text: | + 0x1000-0x1100 r-xp 00000000 00:00 0 + 0x2000-0x2200 rw-p 00000000 00:00 0 + 0x4000-0x6000 rw-- 00000000 00:00 0 + - Type: Memory64List + Memory Ranges: + - Start of Memory Range: 0x1000 + Data Size: 0x100 + Content : '' + - Start of Memory Range: 0x2000 + Data Size: 0x200 + Content : '' +... diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test new file mode 100644 index 0000000000000..da6436cb5ca20 --- /dev/null +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -0,0 +1,68 @@ +# Test that we warn the user about truncated output +# when target.max-children-count wasn't explicitly set. + +# RUN: split-file %s %t +# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DWIM +# +# RUN: %lldb -x -b -s %t/expr-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=EXPR +# +# RUN: %lldb -x -b -s %t/frame-var-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=VAR +# +# RUN: %lldb -x -b -s %t/with-setting-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=SETTING + +#--- main.cpp + +int main() { + int arr[512] = { 3 }; + __builtin_debugtrap(); +} + +#--- dwim-commands.input + +run +dwim-print arr +frame variable arr + +DWIM: (lldb) dwim-print arr +DWIM: *** Some of the displayed variables have more members +DWIM-SAME: use the --show-all-children option to dwim-print +DWIM: (lldb) frame variable arr +DWIM-NOT: *** Some of the displayed variables have more members + +#--- expr-commands.input + +run +expression arr +frame variable arr + +EXPR: (lldb) expression arr +EXPR: *** Some of the displayed variables have more members +EXPR-SAME: use the --show-all-children option to expression +EXPR: (lldb) frame variable arr +EXPR-NOT: *** Some of the displayed variables have more members + +#--- frame-var-commands.input + +run +frame variable arr +dwim-print arr + +VAR: (lldb) frame variable arr +VAR: *** Some of the displayed variables have more members +VAR-SAME: use the --show-all-children option to frame variable +VAR: (lldb) dwim-print arr +VAR-NOT: *** Some of the displayed variables have more members + +#--- with-setting-commands.input + +run +settings set target.max-children-count 1 +frame variable arr + +SETTING: (lldb) frame variable arr +SETTING-NOT: *** Some of the displayed variables have more members diff --git a/lldb/test/Shell/Settings/TestChildDepthTruncation.test b/lldb/test/Shell/Settings/TestChildDepthTruncation.test new file mode 100644 index 0000000000000..12f5661600ae7 --- /dev/null +++ b/lldb/test/Shell/Settings/TestChildDepthTruncation.test @@ -0,0 +1,84 @@ +# Test that we warn the user about truncated output +# when target.max-children-depth wasn't explicitly set. 
+ +# RUN: split-file %s %t +# RUN: %clang_host -g %t/main.cpp -o %t.out +# RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DWIM +# +# RUN: %lldb -x -b -s %t/expr-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=EXPR +# +# RUN: %lldb -x -b -s %t/frame-var-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=VAR +# +# RUN: %lldb -x -b -s %t/with-setting-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=SETTING + +#--- main.cpp + +struct L1 { + int w; + struct L2 { + int x; + struct L3 { + int y; + struct L4 { + int z; + struct L5 { + int a; + } l5; + } l4; + } l3; + } l2; +}; + +int main() { + L1 nested; + __builtin_debugtrap(); +} + +#--- dwim-commands.input + +run +dwim-print nested +frame variable nested + +DWIM: (lldb) dwim-print nested +DWIM: *** Some of the displayed variables have a greater depth of members +DWIM-SAME: use the --depth option to dwim-print +DWIM: (lldb) frame variable nested +DWIM-NOT: *** Some of the displayed variables have a greater depth of members + +#--- expr-commands.input + +run +expression nested +frame variable nested + +EXPR: (lldb) expression nested +EXPR: *** Some of the displayed variables have a greater depth of members +EXPR-SAME: use the --depth option to expression +EXPR: (lldb) frame variable nested +EXPR-NOT: *** Some of the displayed variables have a greater depth of members + +#--- frame-var-commands.input + +run +frame variable nested +frame variable nested + +VAR: (lldb) frame variable nested +VAR: *** Some of the displayed variables have a greater depth of members +VAR-SAME: use the --depth option to frame variable +VAR: (lldb) frame variable nested +VAR-NOT: *** Some of the displayed variables have a greater depth of members + +#--- with-setting-commands.input + +run +settings set target.max-children-depth 1 +frame variable nested + +SETTING: (lldb) frame variable nested +SETTING-NOT: *** Some of the displayed variables have a greater depth of members diff --git a/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test b/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test index d4fcf78d01b81..c29b51219d191 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test +++ b/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test @@ -15,7 +15,7 @@ # RUN: %clang_host -fmodules -Xclang -fmodules-cache-path=%t/cache -I%t -g -gmodules %t/b.m -o %t/b.o -c # RUN: %clang_host %t/a.o %t/b.o -o %t/a.out # RUN: rm -rf %t/cache -# RUN: %lldb %t/a.out -o "b main" -o run -o "p a" -o "p b" -o q 2>&1 | FileCheck %s +# RUN: %lldb %t/a.out -o "b main" -o run -o "expr a" -o "expr b" -o q 2>&1 | FileCheck %s # CHECK: {{[ab]}}.o{{.*}}/cache/{{.*}}/C-{{.*}}.pcm' does not exist # CHECK-NOT: /cache/{{.*}}/C-{.*}.pcm' does not exist # CHECK: {{[ab]}}.o{{.*}}/cache/{{.*}}/C-{{.*}}.pcm' does not exist diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/class_layout.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/class_layout.lldbinit index bbce1e88626e5..301488d5810b3 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/class_layout.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/class_layout.lldbinit @@ -1,3 +1,4 @@ +settings set target.max-children-depth 10 expr a expr b.c expr b.u.c diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 42968128f2702..ac895e8d03ed2 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -241,6 
+241,11 @@ def use_support_substitutions(config): "-lc++", ] + # Facebook T92898286 + if config.llvm_test_bolt: + host_flags += ["--post-link-optimize"] + # End Facebook T92898286 + host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags)) diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index 5be5359217769..7da8d1f49a4a2 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -1,5 +1,10 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -38,6 +43,10 @@ config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-s config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") config.clang_resource_dir = os.path.join("@CLANG_RESOURCE_DIR@") +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index fd89f52595ec6..cbd3b14463e25 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -983,7 +983,7 @@ llvm::Error DAP::Loop() { if (const protocol::Request *req = std::get_if(&*next); - req && req->arguments == "disconnect") + req && req->command == "disconnect") disconnecting = true; const std::optional cancel_args = diff --git a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp index 8ec5d62a99ac5..64177a2fac490 100644 --- a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp +++ b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp @@ -14,6 +14,7 @@ #include "lldb/Core/Disassembler.h" #include "lldb/Target/ExecutionContext.h" #include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/StreamString.h" #include "Plugins/Disassembler/LLVMC/DisassemblerLLVMC.h" @@ -60,12 +61,6 @@ TEST_F(TestMCDisasmInstanceRISCV, TestRISCV32Instruction) { arch, nullptr, nullptr, nullptr, nullptr, start_addr, &data, sizeof(data), num_of_instructions, false); - // If we failed to get a disassembler, we can assume it is because - // the llvm we linked against was not built with the riscv target, - // and we should skip these tests without marking anything as failing. 
- if (!disass_sp) - return; - const InstructionList inst_list(disass_sp->GetInstructionList()); EXPECT_EQ(num_of_instructions, inst_list.GetSize()); @@ -90,3 +85,58 @@ TEST_F(TestMCDisasmInstanceRISCV, TestRISCV32Instruction) { EXPECT_FALSE(inst_sp->IsCall()); EXPECT_TRUE(inst_sp->DoesBranch()); } + +TEST_F(TestMCDisasmInstanceRISCV, TestOpcodeBytePrinter) { + ArchSpec arch("riscv32-*-linux"); + + const unsigned num_of_instructions = 7; + // clang-format off + uint8_t data[] = { + 0x41, 0x11, // addi sp, sp, -0x10 + 0x06, 0xc6, // sw ra, 0xc(sp) + 0x23, 0x2a, 0xa4, 0xfe, // sw a0, -0xc(s0) + 0x23, 0x28, 0xa4, 0xfe, // sw a0, -0x10(s0) + 0x22, 0x44, // lw s0, 0x8(sp) + + 0x3f, 0x00, 0x40, 0x09, // Fake 64-bit instruction + 0x20, 0x00, 0x20, 0x00, + + 0x1f, 0x02, // 48 bit xqci.e.li rd=8 imm=0x1000 + 0x00, 0x00, + 0x00, 0x10, + }; + // clang-format on + + // clang-format off + const char *expected_outputs[] = { + "1141", + "c606", + "fea42a23", + "fea42823", + "4422", + "0940003f 00200020", + "021f 0000 1000" + }; + // clang-format on + const unsigned num_of_expected_outputs = + sizeof(expected_outputs) / sizeof(char *); + + EXPECT_EQ(num_of_instructions, num_of_expected_outputs); + + DisassemblerSP disass_sp; + Address start_addr(0x100); + disass_sp = Disassembler::DisassembleBytes( + arch, nullptr, nullptr, nullptr, nullptr, start_addr, &data, sizeof(data), + num_of_instructions, false); + + const InstructionList inst_list(disass_sp->GetInstructionList()); + EXPECT_EQ(num_of_instructions, inst_list.GetSize()); + + for (size_t i = 0; i < num_of_instructions; i++) { + InstructionSP inst_sp; + StreamString s; + inst_sp = inst_list.GetInstructionAtIndex(i); + inst_sp->GetOpcode().Dump(&s, 1); + ASSERT_STREQ(s.GetString().str().c_str(), expected_outputs[i]); + } +} diff --git a/lldb/unittests/Host/NativeProcessProtocolTest.cpp b/lldb/unittests/Host/NativeProcessProtocolTest.cpp index a48e67c9213da..91c4fd69d6e54 100644 --- a/lldb/unittests/Host/NativeProcessProtocolTest.cpp +++ b/lldb/unittests/Host/NativeProcessProtocolTest.cpp @@ -73,6 +73,97 @@ TEST(NativeProcessProtocolTest, SetBreakpointFailVerify) { llvm::Failed()); } +TEST(NativeProcessProtocolTest, RemoveSoftwareBreakpoint) { + NiceMock DummyDelegate; + MockProcess Process(DummyDelegate, + ArchSpec("x86_64-pc-linux")); + auto Trap = cantFail(Process.GetSoftwareBreakpointTrapOpcode(1)); + auto Original = std::vector{0xbb}; + + // Set up a breakpoint. + { + InSequence S; + EXPECT_CALL(Process, ReadMemory(0x47, 1)) + .WillOnce(Return(ByMove(Original))); + EXPECT_CALL(Process, WriteMemory(0x47, Trap)).WillOnce(Return(ByMove(1))); + EXPECT_CALL(Process, ReadMemory(0x47, 1)).WillOnce(Return(ByMove(Trap))); + EXPECT_THAT_ERROR(Process.SetBreakpoint(0x47, 0, false).ToError(), + llvm::Succeeded()); + } + + // Remove the breakpoint for the first time. This should remove the breakpoint + // from m_software_breakpoints. + // + // Should succeed. + { + InSequence S; + EXPECT_CALL(Process, ReadMemory(0x47, 1)).WillOnce(Return(ByMove(Trap))); + EXPECT_CALL(Process, WriteMemory(0x47, llvm::ArrayRef(Original))) + .WillOnce(Return(ByMove(1))); + EXPECT_CALL(Process, ReadMemory(0x47, 1)) + .WillOnce(Return(ByMove(Original))); + EXPECT_THAT_ERROR(Process.RemoveBreakpoint(0x47, false).ToError(), + llvm::Succeeded()); + } + + // Remove the breakpoint for the second time. + // + // Should fail. 
None of the ReadMemory() or WriteMemory() should be called, + // because the function should early return when seeing that the breakpoint + // isn't in m_software_breakpoints. + { + EXPECT_CALL(Process, ReadMemory(_, _)).Times(0); + EXPECT_CALL(Process, WriteMemory(_, _)).Times(0); + EXPECT_THAT_ERROR(Process.RemoveBreakpoint(0x47, false).ToError(), + llvm::Failed()); + } +} + +TEST(NativeProcessProtocolTest, RemoveSoftwareBreakpointMemoryError) { + NiceMock DummyDelegate; + MockProcess Process(DummyDelegate, + ArchSpec("x86_64-pc-linux")); + auto Trap = cantFail(Process.GetSoftwareBreakpointTrapOpcode(1)); + auto Original = std::vector{0xbb}; + auto SomethingElse = std::vector{0xaa}; + + // Set up a breakpoint. + { + InSequence S; + EXPECT_CALL(Process, ReadMemory(0x47, 1)) + .WillOnce(Return(ByMove(Original))); + EXPECT_CALL(Process, WriteMemory(0x47, Trap)).WillOnce(Return(ByMove(1))); + EXPECT_CALL(Process, ReadMemory(0x47, 1)).WillOnce(Return(ByMove(Trap))); + EXPECT_THAT_ERROR(Process.SetBreakpoint(0x47, 0, false).ToError(), + llvm::Succeeded()); + } + + // Remove the breakpoint for the first time, with an unexpected value read by + // the first ReadMemory(). This should cause an early return, with the + // breakpoint removed from m_software_breakpoints. + // + // Should fail. + { + InSequence S; + EXPECT_CALL(Process, ReadMemory(0x47, 1)) + .WillOnce(Return(ByMove(SomethingElse))); + EXPECT_THAT_ERROR(Process.RemoveBreakpoint(0x47, false).ToError(), + llvm::Failed()); + } + + // Remove the breakpoint for the second time. + // + // Should fail. None of the ReadMemory() or WriteMemory() should be called, + // because the function should early return when seeing that the breakpoint + // isn't in m_software_breakpoints. + { + EXPECT_CALL(Process, ReadMemory(_, _)).Times(0); + EXPECT_CALL(Process, WriteMemory(_, _)).Times(0); + EXPECT_THAT_ERROR(Process.RemoveBreakpoint(0x47, false).ToError(), + llvm::Failed()); + } +} + TEST(NativeProcessProtocolTest, ReadMemoryWithoutTrap) { NiceMock DummyDelegate; MockProcess Process(DummyDelegate, @@ -146,4 +237,4 @@ TEST(NativeProcessProtocolTest, ReadCStringFromMemory_CrossPageBoundary) { bytes_read), llvm::HasValue(llvm::StringRef("hello"))); EXPECT_EQ(bytes_read, 6UL); -} \ No newline at end of file +} diff --git a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp index ee31c8e63644b..44f653c6fa135 100644 --- a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp +++ b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp @@ -308,16 +308,19 @@ TEST_F(MinidumpParserTest, GetMemory) { )"), llvm::Succeeded()); - EXPECT_EQ((llvm::ArrayRef{0x54}), parser->GetMemory(0x401d46, 1)); - EXPECT_EQ((llvm::ArrayRef{0x54, 0x21}), - parser->GetMemory(0x401d46, 4)); - - EXPECT_EQ((llvm::ArrayRef{0xc8, 0x4d, 0x04, 0xbc, 0xe9}), - parser->GetMemory(0x7ffceb34a000, 5)); - EXPECT_EQ((llvm::ArrayRef{0xc8, 0x4d, 0x04}), - parser->GetMemory(0x7ffceb34a000, 3)); - - EXPECT_EQ(llvm::ArrayRef(), parser->GetMemory(0x500000, 512)); + EXPECT_THAT_EXPECTED(parser->GetMemory(0x401d46, 1), + llvm::HasValue(llvm::ArrayRef{0x54})); + EXPECT_THAT_EXPECTED(parser->GetMemory(0x401d46, 4), + llvm::HasValue(llvm::ArrayRef{0x54, 0x21})); + EXPECT_THAT_EXPECTED( + parser->GetMemory(0x7ffceb34a000, 5), + llvm::HasValue(llvm::ArrayRef{0xc8, 0x4d, 0x04, 0xbc, 0xe9})); + EXPECT_THAT_EXPECTED( + parser->GetMemory(0x7ffceb34a000, 3), + llvm::HasValue(llvm::ArrayRef{0xc8, 0x4d, 0x04})); + EXPECT_THAT_EXPECTED( + 
parser->GetMemory(0x500000, 512), + llvm::FailedWithMessage("No memory range found for address (0x500000)")); } TEST_F(MinidumpParserTest, FindMemoryRangeWithFullMemoryMinidump) { diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 3f8201fa426fe..8c52806728e5a 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -179,6 +179,13 @@ if ("flang" IN_LIST LLVM_ENABLE_PROJECTS) endif () endif() +if ("lldb" IN_LIST LLVM_ENABLE_PROJECTS) + if (NOT "clang" IN_LIST LLVM_ENABLE_PROJECTS) + message(STATUS "Enabling clang as a dependency of lldb") + list(APPEND LLVM_ENABLE_PROJECTS "clang") + endif() +endif () + if ("libc" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated. Please use " "-DLLVM_ENABLE_RUNTIMES=libc or see the instructions at " @@ -779,6 +786,10 @@ set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH option(LLVM_USE_SPLIT_DWARF "Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF) +# Facebook T92898286 +option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF) +# End Facebook T92898286 + # Define an option controlling whether we should build for 32-bit on 64-bit # platforms, where supported. if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX")) diff --git a/llvm/clang/test/Modules/implicit-opt-level.c b/llvm/clang/test/Modules/implicit-opt-level.c new file mode 100644 index 0000000000000..f6f1f58d31d79 --- /dev/null +++ b/llvm/clang/test/Modules/implicit-opt-level.c @@ -0,0 +1,15 @@ +// This test checks that under implicit modules, different optimization levels +// get different context hashes. + +// RUN: rm -rf %t +// RUN: split-file %s %t + +//--- module.modulemap +module M { header "M.h" } +//--- M.h +//--- tu.c +#include "M.h" + +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache -O0 -fsyntax-only %t/tu.c +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache -O1 -fsyntax-only %t/tu.c +// RUN: find %t/cache -name "M-*.pcm" | count 2 diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst index 88fc1fd326b76..f85f32a1fdd51 100644 --- a/llvm/docs/CommandGuide/index.rst +++ b/llvm/docs/CommandGuide/index.rst @@ -27,6 +27,7 @@ Basic Commands llvm-dis llvm-dwarfdump llvm-dwarfutil + llvm-ir2vec llvm-lib llvm-libtool-darwin llvm-link diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst b/llvm/docs/CommandGuide/llvm-ir2vec.rst new file mode 100644 index 0000000000000..13fe4996b968f --- /dev/null +++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst @@ -0,0 +1,170 @@ +llvm-ir2vec - IR2Vec Embedding Generation Tool +============================================== + +.. program:: llvm-ir2vec + +SYNOPSIS +-------- + +:program:`llvm-ir2vec` [*options*] *input-file* + +DESCRIPTION +----------- + +:program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It +generates IR2Vec embeddings for LLVM IR and supports triplet generation +for vocabulary training. It provides two main operation modes: + +1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary + training from LLVM IR. + +2. **Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary + at different granularity levels (instruction, basic block, or function). + +The tool is designed to facilitate machine learning applications that work with +LLVM IR by converting the IR into numerical representations that can be used by +ML models. + +.. 
note::
+
+   For information about using IR2Vec programmatically within LLVM passes and
+   the C++ API, see the `IR2Vec Embeddings `_
+   section in the MLGO documentation.
+
+OPERATION MODES
+---------------
+
+Triplet Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets
+consisting of opcodes, types, and operands. These triplets can be used to train
+vocabularies for embedding generation.
+
+Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=triplets input.bc -o triplets.txt
+
+Embedding Generation Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In embedding mode, :program:`llvm-ir2vec` uses a pre-trained vocabulary to
+generate numerical embeddings for LLVM IR at different levels of granularity.
+
+Example Usage:
+
+.. code-block:: bash
+
+   llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json --level=func input.bc -o embeddings.txt
+
+OPTIONS
+-------
+
+.. option:: --mode=<mode>
+
+  Specify the operation mode. Valid values are:
+
+  * ``triplets`` - Generate triplets for vocabulary training
+  * ``embeddings`` - Generate embeddings using trained vocabulary (default)
+
+.. option:: --level=<level>
+
+  Specify the embedding generation level. Valid values are:
+
+  * ``inst`` - Generate instruction-level embeddings
+  * ``bb`` - Generate basic block-level embeddings
+  * ``func`` - Generate function-level embeddings (default)
+
+.. option:: --function=<name>
+
+  Process only the specified function instead of all functions in the module.
+
+.. option:: --ir2vec-vocab-path=<path>
+
+  Specify the path to the vocabulary file (required for embedding mode).
+  The vocabulary file should be in JSON format and contain the trained
+  vocabulary for embedding generation. See `llvm/lib/Analysis/models`
+  for pre-trained vocabulary files.
+
+.. option:: --ir2vec-opc-weight=<weight>
+
+  Specify the weight for opcode embeddings (default: 1.0). This controls
+  the relative importance of instruction opcodes in the final embedding.
+
+.. option:: --ir2vec-type-weight=<weight>
+
+  Specify the weight for type embeddings (default: 0.5). This controls
+  the relative importance of type information in the final embedding.
+
+.. option:: --ir2vec-arg-weight=<weight>
+
+  Specify the weight for argument embeddings (default: 0.2). This controls
+  the relative importance of operand information in the final embedding.
+
+.. option:: -o <filename>
+
+  Specify the output filename. Use ``-`` to write to standard output (default).
+
+.. option:: --help
+
+  Print a summary of command line options.
+
+.. note::
+
+  ``--level``, ``--function``, ``--ir2vec-vocab-path``, ``--ir2vec-opc-weight``,
+  ``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in embedding
+  mode. These options are ignored in triplet mode.
+
+INPUT FILE FORMAT
+-----------------
+
+:program:`llvm-ir2vec` accepts LLVM bitcode files (``.bc``) and LLVM IR files
+(``.ll``) as input. The input file should contain valid LLVM IR.
+
+OUTPUT FORMAT
+-------------
+
+Triplet Mode Output
+~~~~~~~~~~~~~~~~~~~
+
+In triplet mode, the output consists of lines containing space-separated triplets:
+
+.. code-block:: text
+
+   <opcode> <type> <operand1> <operand2> ...
+
+Each line represents the information of one instruction, with the opcode, type,
+and operands.
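+
+For illustration, a single ``add`` instruction might produce a line of the
+following shape (the token spellings here are illustrative only; the exact
+entities depend on the vocabulary used, not on anything guaranteed above):
+
+.. code-block:: text
+
+   add integerTy variable variable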
+ +Embedding Mode Output +~~~~~~~~~~~~~~~~~~~~~ + +In embedding mode, the output format depends on the specified level: + +* **Function Level**: One embedding vector per function +* **Basic Block Level**: One embedding vector per basic block, grouped by function +* **Instruction Level**: One embedding vector per instruction, grouped by basic block and function + +Each embedding is represented as a floating point vector. + +EXIT STATUS +----------- + +:program:`llvm-ir2vec` returns 0 on success, and a non-zero value on failure. + +Common failure cases include: + +* Invalid or missing input file +* Missing or invalid vocabulary file (in embedding mode) +* Specified function not found in the module +* Invalid command line options + +SEE ALSO +-------- + +:doc:`../MLGO` + +For more information about the IR2Vec algorithm and approach, see: +`IR2Vec: LLVM IR Based Scalable Program Embeddings `_. diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index dc53072e09e39..d87a8bd81cc7b 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -354,11 +354,6 @@ The :doc:`CodeOfConduct` applies to all office hours. - Every first Friday of the month, 14:00 UK time, for 60 minutes. - `Google meet `__ - English, Portuguese - * - Rotating hosts - - Getting Started, beginner questions, new contributors. - - Every Tuesday at 2 PM ET (11 AM PT), for 30 minutes. - - `Google meet `__ - - English For event owners, our Discord bot also supports sending automated announcements of upcoming office hours. Please see the :ref:`discord-bot-event-pings` section diff --git a/llvm/docs/HowToSubmitABug.rst b/llvm/docs/HowToSubmitABug.rst index 1935678b1da33..002087cc55e0a 100644 --- a/llvm/docs/HowToSubmitABug.rst +++ b/llvm/docs/HowToSubmitABug.rst @@ -112,15 +112,20 @@ If this does crash, then you should be able to debug this with the following Run this, then file a bug with the instructions and reduced .bc files that bugpoint emits. -If bugpoint doesn't reproduce the crash, ``llvm-reduce`` is an alternative -way to reduce LLVM IR. Create a script that repros the crash and run: +If bugpoint doesn't reproduce the crash, +:doc:`llvm-reduce ` is an alternative way to reduce +LLVM IR. Create a script that repros the crash and run: .. code-block:: bash llvm-reduce --test=path/to/script foo.bc -which should produce reduced IR that reproduces the crash. Be warned the -``llvm-reduce`` is still fairly immature and may crash. +which should produce reduced IR that reproduces the crash. + +.. TIP:: + ``llvm-reduce`` is still fairly immature and may crash. On the other hand, + unlike ``bugpoint``, ``llvm-reduce -j $NUM_THREADS`` is multi-threaded and + can therefore potentially be much faster. If none of the above work, you can get the IR before a crash by running the ``opt`` command with the ``--print-before-all --print-module-scope`` flags to diff --git a/llvm/docs/HowToUpdateDebugInfo.rst b/llvm/docs/HowToUpdateDebugInfo.rst index c3262a96b62e4..915e2896023c5 100644 --- a/llvm/docs/HowToUpdateDebugInfo.rst +++ b/llvm/docs/HowToUpdateDebugInfo.rst @@ -169,6 +169,14 @@ See the discussion in the section about :ref:`merging locations` for examples of when the rule for dropping locations applies. +When to remap a debug location +------------------------------ + +When code paths are duplicated, during passes such as loop unrolling or jump +threading, `DILocation` attachments need to be remapped using `mapAtomInstance` +and `RemapSourceAtom`. 
This is to support the Key Instructions debug info feature. +See :doc:`KeyInstructionsDebugInfo` for information. + .. _NewInstLocations: Setting locations for new instructions @@ -496,7 +504,7 @@ as follows: .. code-block:: bash - $ llvm-original-di-preservation.py sample.json sample.html + $ llvm-original-di-preservation.py sample.json --report-file sample.html Testing of original debug info preservation can be invoked from front-end level as follows: diff --git a/llvm/docs/KeyInstructionsDebugInfo.md b/llvm/docs/KeyInstructionsDebugInfo.md new file mode 100644 index 0000000000000..305740554c0fe --- /dev/null +++ b/llvm/docs/KeyInstructionsDebugInfo.md @@ -0,0 +1,160 @@ +# Key Instructions debug info in LLVM and Clang + +Key Instructions is an LLVM feature that reduces the jumpiness of optimized code debug stepping by distinguishing the significance of instructions that make up source language statements. This document explains the feature and how it is implemented in [LLVM](#llvm) and [Clang](#clang-and-other-front-ends). + +## Status + +Feature complete except for coroutines, which fall back to not-key-instructions handling for now but will get support soon (there is no fundamental reason why they cannot be supported, we've just not got to it at time of writing). + +Tell Clang [not] to produce Key Instructions metadata with `-g[no-]key-instructions`. + +The feature improves optimized code stepping; it's intended for the feature to be used with optimisations enabled. Although the feature works at O0 it is not recommended because in some cases the effect of editing variables may not always be immediately realised. In some cases, debuggers may place a breakpoint after parts of an expression have been evaluated, which limits the ability to have variable edits affect expressions. (This is a quirk of the current implementation, rather than fundamental limitation, covered in more detail [later](#disabled-at-o0).) + +This is a DWARF-based feature. There is currently no plan to support CodeView. + +Set LLVM flag `-dwarf-use-key-instructions` to `false` to ignore Key Instructions metadata when emitting DWARF. + +# LLVM + +## Problem statement + +A lot of the noise in stepping comes from code motion and instruction scheduling. Consider a long expression on a single line. It may involve multiple operations that optimisations move, re-order, and interleave with other instructions that have different line numbers. + +DWARF provides a helpful tool the compiler can employ to mitigate this jumpiness, the `is_stmt` flag, which indicates that an instruction is a recommended breakpoint location. However, LLVM's current approach to deciding `is_stmt` placement essentially reduces down to "is the associated line number different to the previous instruction's?". + +(Note: It's up to the debugger if it wants to interpret `is_stmt` or not, and at time of writing LLDB doesn't; possibly because until now LLVM's `is_stmt`s convey no information that can't already be deduced from the rest of the line table.) + +## Solution overview + +Taking ideas from two papers [1][2] that explore the issue, especially C. Tice's: + +From the perspective of a source-level debugger user: + +* Source code is made up of interesting constructs; the level of granularity for “interesting” while stepping is typically assignments, calls, control flow. We’ll call these interesting constructs Atoms. 
+
+* Atoms usually have one instruction that implements the functionality that a user can observe; once they step “off” that instruction, the atom is finalised. We’ll call that a Key Instruction.
+
+* Communicating where the key instructions are to the debugger (using DWARF’s is_stmt) avoids jumpiness introduced by scheduling non-key instructions without losing source attribution (because non-key instructions retain an associated source location, they’re just ignored for stepping).
+
+## Solution implementation
+
+1. `DILocation` has 2 new fields, `atomGroup` and `atomRank`. `DISubprogram` has a new field `keyInstructions`.
+2. Clang creates `DILocations` using the new fields to communicate which instructions are "interesting", and sets `keyInstructions` true in `DISubprogram`s to tell LLVM to interpret the new metadata in those functions.
+3. There’s some bookkeeping required by optimisations that duplicate control flow.
+4. During DWARF emission, the new metadata is collected (linear scan over instructions) to decide `is_stmt` placements.
+
+Details:
+
+1. *The metadata* - The two new `DILocation` fields are `atomGroup` and `atomRank`, and both are unsigned integers. `atomGroup` is 61 bits and `atomRank` 3 bits. Instructions in the same function with the same `(atomGroup, inlinedAt)` pair are part of the same source atom. `atomRank` determines `is_stmt` preference within that group, where a lower number is higher precedence. Higher rank instructions act as "backup" `is_stmt` locations, providing good fallback locations if/when the primary candidate gets optimized away. The default values of 0 indicate the instruction isn’t interesting - it's not an `is_stmt` candidate. If `keyInstructions` in `DISubprogram` is false (default) then the new `DILocation` metadata is ignored for the function (including inlined instances) when emitting DWARF.
+
+2. *Clang annotates key instructions* with the new metadata. Variable assignments (stores, memory intrinsics), control flow (branches and their conditions, some unconditional branches), and exception handling instructions are annotated. Calls are ignored as they're unconditionally marked `is_stmt`.
+
+3. *Throughout optimisation*, the `DILocation` is propagated normally. Cloned instructions get the original’s `DILocation`, the new fields get merged in `getMergedLocation`, etc. However, pass writers need to intercede in cases where a code path is duplicated, e.g. unrolling, jump-threading. In these cases we want to emit key instructions in both the original and duplicated code, so the duplicated code must be assigned new `atomGroup` numbers, in a similar way that instruction operands must get remapped. There are facilities to help with this: `mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap)` adds an entry to `VMap` which can later be used for remapping using `llvm::RemapSourceAtom(Instruction *I, ValueToValueMapTy &VM)`. `mapAtomInstance` is called from `llvm::CloneBasicBlock` and `llvm::RemapSourceAtom` is called from `llvm::RemapInstruction` so in many cases no additional work is actually needed.
+The `DILocations` carry over from IR to MIR as normal, without any changes.
+
+4. *DWARF emission* - Iterate over all instructions in a function. For each `(atomGroup, inlinedAt)` pair we find the set of instructions sharing the lowest rank. Only the last of these instructions in each basic block is included in the set.
The instructions in this set get `is_stmt` applied to their source locations. That `is_stmt` then "floats" to the top of a contiguous sequence of instructions with the same line number in the same basic block. That has two benefits when optimisations are enabled. First, this floats `is_stmt` to the top of epilogue instructions (rather than applying it to the `ret` instruction itself), which is important to avoid losing variable location coverage at return statements. Second, it reduces the difference in optimized code stepping behaviour between when Key Instructions is enabled and disabled in “uninteresting” cases. I.e., it appears to generally reduce unnecessary changes in stepping.\
+We’ve used contiguous line numbers rather than atom membership as the test there because of our choice to represent source atoms with a single integer ID. We can’t have instructions belonging to multiple atom groups or represent any kind of grouping hierarchy. That means we can’t rely on all the call setup instructions being in the same group currently (e.g., if one of the argument expressions contains key functionality such as a store, it will be in its own group).
+
+## Limitations
+
+### Lack of multiple atom membership
+
+Using a number to represent atom membership is limiting; currently an instruction that belongs to multiple source atoms cannot belong to multiple atom groups. This does occur in practice, both in the front end and during optimisations. Consider this C code:
+```c
+a = b = c;
+```
+Clang generates this IR:
+```llvm
+  %0 = load i32, ptr %c.addr, align 4
+  store i32 %0, ptr %b.addr, align 4
+  store i32 %0, ptr %a.addr, align 4
+```
+The load of `c` is used by both stores (which are the Key Instructions for each assignment respectively). We can only use it as a backup location for one of the two atoms.
+
+Certain optimisations merge source locations, which presents another case where it might make sense to be able to represent an instruction belonging to multiple atoms. Currently we deterministically pick one (choosing to keep the lower rank one if there is one).
+
+### Disabled at O0
+
+Consider the following code without optimisations:
+```c
+int c =
+    a + b;
+```
+In the current implementation an `is_stmt` won't be generated for the `a + b` instruction, meaning debuggers will likely step over the `add` and stop at the `store` of the result into `c` (which does get `is_stmt`). A user might have wished to edit `a` or `b` on the previous line in order to alter the result stored to `c`, which they now won't have the chance to do (they'd need to edit the variables on a previous line instead). If the expression was all on one line then they would be able to edit the values before the `add`. For these reasons we're choosing to recommend that the feature should not be enabled at O0.
+
+It should be possible to fix this case if we make a few changes: add all the instructions in the statement (i.e., including the loads) to the atom, and tweak the DwarfEmission code to understand this situation (same atom, different line). So there is room to pursue this in the future.
Though that gets tricky in some cases due to the [other limitation mentioned above](#lack-of-multiple-atom-membership), e.g.:
+```c
+int e =      // atom 1
+  (a + b)    // atom 1
+  * (c = d); // - atom 2
+```
+```llvm
+  %0 = load i32, ptr %a.addr, align 4 ; atom 1
+  %1 = load i32, ptr %b.addr, align 4 ; atom 1
+  %add = add nsw i32 %0, %1           ; atom 1
+  %2 = load i32, ptr %d.addr, align 4 ; - atom 2
+  store i32 %2, ptr %c.addr, align 4  ; - atom 2
+  %mul = mul nsw i32 %add, %2         ; atom 1
+  store i32 %mul, ptr %e, align 4     ; atom 1
+```
+Without multiple-atom membership or some kind of atom hierarchy it's not apparent how to get the `is_stmt` to stick to `a + b`, given the other rules that `is_stmt` placement follows.
+
+O0 isn't a key use case, so solving this is not a priority for the initial implementation. The trade-off, smoother stepping at the cost of not being able to edit variables to affect an expression in some cases (and at particular stop points), becomes more attractive when optimisations are enabled (we find that editing variables in the debugger in optimized code often produces unexpected effects, so it's not a big concern that Key Instructions sometimes makes it harder).
+
+# Clang and other front ends
+
+Tell Clang [not] to produce Key Instructions metadata with `-g[no-]key-instructions`.
+
+## Implementation
+
+Clang needs to annotate key instructions with the new metadata. Variable assignments (stores, memory intrinsics), control flow (branches and their conditions, some unconditional branches), and exception handling instructions are annotated. Calls are ignored as they're unconditionally marked `is_stmt`. This is achieved with a few simple constructs:
+
+Class `ApplyAtomGroup` - This is a scoped helper, similar to `ApplyDebugLocation`, that creates a new source atom group which instructions can be added to. It's used during CodeGen to declare that a new source atom has started, e.g. in `CodeGenFunction::EmitBinaryOperatorLValue`.
+
+`CodeGenFunction::addInstToCurrentSourceAtom(llvm::Instruction *KeyInstruction, llvm::Value *Backup)` adds an instruction (and a backup instruction if non-null) to the current "atom group" defined with `ApplyAtomGroup`. The Key Instruction gets rank 1, and backup instructions get higher ranks (the function looks through casts, applying increasing rank as it goes). There are a lot of sites in Clang that need to call this (mostly stores and store-like instructions). Most stores created through `CGBuilderTy` are annotated, but some that don't need to be key are not. It's important to remember that if there's no active atom group, i.e. no active `ApplyAtomGroup` instance, then `addInstToCurrentSourceAtom` does not annotate the instructions.
+
+`CodeGenFunction::addInstToNewSourceAtom(llvm::Instruction *KeyInstruction, llvm::Value *Backup)` adds an instruction (and a backup instruction if non-null) to a new "atom group". It is currently mostly used in loop handling code.
+
+`CodeGenFunction::addInstToSpecificSourceAtom(llvm::Instruction *KeyInstruction, llvm::Value *Backup, uint64_t Atom)` adds the instruction (and backup instruction if non-null) to the specific group `Atom`. This is currently only used for `ret`s, which is explored in the examples below. Special handling is needed because an existing atom group sometimes needs to be reused, so neither of the other helper functions is appropriate.
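+
+To make the pattern concrete, here is a minimal sketch of how the helpers above are intended to compose. This is illustrative pseudo-Clang rather than the exact upstream code: `EmitExampleInit` is an invented stand-in, the real annotation happens inside shared helpers such as `EmitStoreOfScalar`, and the precise `ApplyAtomGroup` constructor arguments may differ in-tree.
+
+```c++
+// Sketch only: treat one variable initialisation as one source atom.
+void CodeGenFunction::EmitExampleInit(const Expr *Init, Address Slot) {
+  // Open a new atom group for this source atom. It is scoped: the group
+  // ends when Grp is destroyed, much like ApplyDebugLocation.
+  ApplyAtomGroup Grp(getDebugInfo());
+
+  // The final RHS value (typically a load here) becomes the backup
+  // is_stmt candidate in case the store is later optimized away.
+  llvm::Value *V = EmitScalarExpr(Init);
+  llvm::StoreInst *S = Builder.CreateStore(V, Slot);
+
+  // The store is the Key Instruction (rank 1); V is recorded in the same
+  // atomGroup at a higher rank (looking through casts).
+  addInstToCurrentSourceAtom(S, V);
+}
+```
+
+The scoped-group design means emission helpers can call `addInstToCurrentSourceAtom` unconditionally: with no active group the call simply annotates nothing, which is what makes it safe to invoke from shared code paths.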
+
+## Examples
+
+A simple example walkthrough:
+```c
+void fun(int a) {
+  int b = a;
+}
+```
+
+There are two key instructions here: the assignment and the implicit return. We want to emit metadata that looks like this:
+
+```llvm
+define hidden void @_Z3funi(i32 noundef %a) #0 !dbg !11 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 %a, ptr %a.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4, !dbg !DILocation(line: 2, scope: !11, atomGroup: 1, atomRank: 2)
+  store i32 %0, ptr %b, align 4, !dbg !DILocation(line: 2, scope: !11, atomGroup: 1, atomRank: 1)
+  ret void, !dbg !DILocation(line: 3, scope: !11, atomGroup: 2, atomRank: 1)
+}
+```
+
+The store is the key instruction for the assignment (`atomGroup` 1). The instruction corresponding to the final (and in this case only) RHS value, the load from `%a.addr`, is a good backup location for `is_stmt` if the store gets optimized away. It's part of the same source atom, but has lower `is_stmt` precedence, so it gets a higher `atomRank`. This is achieved by starting an atom group with `ApplyAtomGroup` for the source atom (in this case a variable init) in `EmitAutoVarInit`. The instructions (both key and backup) are then annotated by a call to `addInstToCurrentSourceAtom` from `EmitStoreOfScalar`.
+
+The implicit return is also key (`atomGroup` 2) so that it's stepped on, to match existing non-key-instructions behaviour. This is achieved by calling `addInstToNewSourceAtom` from within `EmitFunctionEpilog`.
+
+Explicit return statements are handled uniquely. Rather than emitting a `ret` for each `return`, in all but the simplest cases (as in the first example) Clang emits a branch to a dedicated block with a single `ret`. That branch is the key instruction for the return statement. If there's only one branch to that block, because there's only one `return` (as in this example), Clang folds the block into its only predecessor. Handily, `EmitReturnBlock` returns the `DebugLoc` associated with the single branch in that case, which is fed into `addInstToSpecificSourceAtom` to ensure the `ret` gets the right group.
+
+
+## Supporting Key Instructions from another front end
+
+Front ends that want to use the feature need to group and rank instructions according to their source atoms and interestingness by attaching `DILocations` with the necessary `atomGroup` and `atomRank` values. They also need to set the `keyInstructions` field to `true` in `DISubprogram`s to tell LLVM to interpret the new metadata in those functions.
+
+The prototype had LLVM annotate instructions (instead of Clang) using simple heuristics (just looking at the kind of instruction, e.g., annotating all stores, conditional branches, etc.). This doesn't exist anywhere upstream, but it could be shared if there's interest (e.g., so another front end can try the feature out before committing to a full implementation); feel free to reach out on Discourse (@OCHyams, @jmorse).
+
+---
+
+**References**
+* [1] Key Instructions: Solving the Code Location Problem for Optimized Code (C. Tice, S. L. Graham, 2000)
+* [2] Debugging Optimized Code: Concepts and Implementation on DIGITAL Alpha Systems (R. F. Brender et al.)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 8ea850af7a69b..371f356c80b0a 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3240,12 +3240,24 @@ as follows:
 as :ref:`Non-Integral Pointer Type <nointptrtype>` s. The ``0`` address space
 cannot be specified as non-integral.
-Unless explicitly stated otherwise, on every specification that specifies
-an alignment, the value of the alignment must be in the range [1,2^16)
-and must be a power of two times the width of a byte.
-On every specification that takes a ``<abi>:<pref>``, specifying the
-``<pref>`` alignment is optional. If omitted, the preceding ``:``
-should be omitted too and ``<pref>`` will be equal to ``<abi>``.
+``<abi>`` is a lower bound on what is required for a type to be considered
+aligned. This is used in various places, such as:
+
+- The alignment for loads and stores if none is explicitly given.
+- The alignment used to compute struct layout.
+- The alignment used to compute allocation sizes and thus ``getelementptr``
+  offsets.
+- The alignment below which accesses are considered underaligned.
+
+``<pref>`` allows providing a more optimal alignment that should be used when
+possible, primarily for ``alloca`` and the alignment of global variables. It is
+an optional value that must be greater than or equal to ``<abi>``. If omitted,
+the preceding ``:`` should also be omitted and ``<pref>`` will be equal to
+``<abi>``.
+
+Unless explicitly stated otherwise, every alignment specification is provided in
+bits and must be in the range [1,2^16). The value must be a power of two times
+the width of a byte (i.e. ``align = 8 * 2^N``).
 
 When constructing the data layout for a given target, LLVM starts with a
 default set of specifications which are then (possibly) overridden by
@@ -4855,7 +4867,7 @@ to be eliminated. This is because '``poison``' is stronger than '``undef``'.
 
       %D = undef
       %E = icmp slt %D, 4
-      %F = icmp gte %D, 4
+      %F = icmp sge %D, 4
 
    Safe:
       %A = undef
@@ -28302,9 +28314,9 @@ Overview:
 
 The '``llvm.experimental.constrained.lrint``' intrinsic returns the first
 argument rounded to the nearest integer. An inexact floating-point exception
-will be raised if the argument is not an integer. An invalid exception is
-raised if the result is too large to fit into a supported integer type,
-and in this case the result is undefined.
+will be raised if the argument is not an integer. If the rounded value is too
+large to fit into the result type, an invalid exception is raised, and the
+return value is a non-deterministic value (equivalent to `freeze poison`).
 
 Arguments:
 """"""""""
 
@@ -28350,9 +28362,9 @@ Overview:
 
 The '``llvm.experimental.constrained.llrint``' intrinsic returns the first
 argument rounded to the nearest integer. An inexact floating-point exception
-will be raised if the argument is not an integer. An invalid exception is
-raised if the result is too large to fit into a supported integer type,
-and in this case the result is undefined.
+will be raised if the argument is not an integer. If the rounded value is too
+large to fit into the result type, an invalid exception is raised, and the
+return value is a non-deterministic value (equivalent to `freeze poison`).
 
 Arguments:
 """"""""""
 
@@ -28701,8 +28713,9 @@ Overview:
 
 The '``llvm.experimental.constrained.lround``' intrinsic returns the first
 argument rounded to the nearest integer with ties away from zero. It will
 raise an inexact floating-point exception if the argument is not an integer.
-An invalid exception is raised if the result is too large to fit into a
-supported integer type, and in this case the result is undefined.
+If the rounded value is too large to fit into the result type, an invalid
+exception is raised, and the return value is a non-deterministic value
+(equivalent to `freeze poison`).
 Arguments:
 """"""""""
 
@@ -28739,8 +28752,9 @@ Overview:
 
 The '``llvm.experimental.constrained.llround``' intrinsic returns the first
 argument rounded to the nearest integer with ties away from zero. It will
 raise an inexact floating-point exception if the argument is not an integer.
-An invalid exception is raised if the result is too large to fit into a
-supported integer type, and in this case the result is undefined.
+If the rounded value is too large to fit into the result type, an invalid
+exception is raised, and the return value is a non-deterministic value
+(equivalent to `freeze poison`).
 
 Arguments:
 """"""""""
 
diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index ed0769bebeac3..965a21b8c84b8 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -468,6 +468,13 @@ The core components are:
 Using IR2Vec
 ------------
 
+.. note::
+
+   This section describes how to use IR2Vec within LLVM passes. A standalone
+   tool :doc:`CommandGuide/llvm-ir2vec` is available for generating the
+   embeddings and triplets from LLVM IR files, which can be useful for
+   training vocabularies and generating embeddings outside of compiler passes.
+
 For generating embeddings, first the vocabulary should be obtained. Then, the
 embeddings can be computed and accessed via an ``ir2vec::Embedder`` instance.
 
@@ -524,6 +531,10 @@ Further Details
 
 For more detailed information about the IR2Vec algorithm, its parameters, and
 advanced usage, please refer to the original paper:
 `IR2Vec: LLVM IR Based Scalable Program Embeddings `_.
+
+For information about using the IR2Vec tool for generating embeddings and
+triplets from LLVM IR, see :doc:`CommandGuide/llvm-ir2vec`.
+
 The LLVM source code for ``IR2Vec`` can also be explored to understand the
 implementation details.
 
@@ -595,4 +606,3 @@ optimizations that are currently MLGO-enabled, it may be used as follows:
 where the ``name`` is a path fragment. We will expect to find 2 files,
 ``<name>.in`` (readable, data incoming from the managing process) and
 ``<name>.out`` (writable, the model runner sends data to the managing process)
-
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 158a20cce7f85..9f6ac558b6f7c 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -516,6 +516,9 @@ The current vendor extensions supported are:
 ``XAndesPerf``
   LLVM implements `version 5.0.0 of the Andes Performance Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
 
+``XAndesBFHCvt``
+  LLVM implements `version 5.0.0 of the Andes Scalar BFLOAT16 Conversion Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
+
 ``XAndesVBFHCvt``
   LLVM implements `version 5.0.0 of the Andes Vector BFLOAT16 Conversion Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index daf822388a2ff..5591ac619c399 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -145,6 +145,10 @@ Changes to the LoongArch Backend
 
 * Changing the default code model from `small` to `medium` for 64-bit.
 * Added inline asm support for the `q` constraint.
+* Added the `32s` target feature for LA32S ISA extensions.
+* Added codegen support for atomic-ops (`cmpxchg`, `max`, `min`, `umax`, `umin`) on LA32.
+* Added codegen support for the ILP32D calling convention.
+* Added several codegen and vectorization optimizations.
 Changes to the MIPS Backend
 ---------------------------
 
@@ -214,6 +218,7 @@ Changes to the RISC-V Backend
 * `-mcpu=andes-ax45mpv` was added.
 * Removed -mattr=+no-rvc-hints that could be used to disable parsing and generation of RVC hints.
 * Adds assembler support for the Andes `XAndesvsintload` (Andes Vector INT4 Load extension).
+* Adds assembler support for the Andes `XAndesbfhcvt` (Andes Scalar BFLOAT16 Conversion extension).
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -228,6 +233,8 @@ Changes to the X86 Backend
 --------------------------
 
 * `fp128` will now use `*f128` libcalls on 32-bit GNU targets as well.
+* On x86-32, `fp128` and `i128` are now passed with the expected 16-byte stack
+  alignment.
 
 Changes to the OCaml bindings
 -----------------------------
@@ -306,6 +313,16 @@ Changes to LLDB
     stop reason = SIGSEGV: sent by tkill system call (sender pid=649752, uid=2667987)
   ```
 * ELF Cores can now have their siginfo structures inspected using `thread siginfo`.
+* LLDB now uses
+  [DIL](https://discourse.llvm.org/t/rfc-data-inspection-language/69893) as the
+  default implementation for 'frame variable'. This should not change the
+  behavior of 'frame variable' at all, at this time. To revert to using the
+  old implementation use: `settings set target.experimental.use-DIL false`.
+* Disassembly of unknown instructions now produces `<unknown>` instead of
+  nothing at all.
+* Changed the format of opcode bytes to match llvm-objdump when disassembling
+  RISC-V code with `disassemble`'s `--byte` option.
+
 
 ### Changes to lldb-dap
 
diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst
index 8f04b6594de79..5cb8d04c0da2a 100644
--- a/llvm/docs/Security.rst
+++ b/llvm/docs/Security.rst
@@ -157,6 +157,7 @@ Members of the LLVM Security Response Group are expected to:
 * Help write and review patches to address security issues.
 * Participate in the member nomination and removal processes.
 
+.. _security-group-discussion-medium:
 
 Discussion Medium
 =================
@@ -204,6 +205,10 @@ The LLVM Security Policy may be changed by majority vote of the LLVM Security Re
 What is considered a security issue?
 ====================================
 
+We define "security-sensitive" to mean that a discovered bug or vulnerability
+may require coordinated disclosure, and therefore should be reported to the LLVM
+Security Response group rather than published in the public bug tracker.
+
 The LLVM Project has a significant amount of code, and not all of it is
 considered security-sensitive. This is particularly true because LLVM is used in
 a wide variety of circumstances: there are different threat models, untrusted
- -The security-sensitive parts of the LLVM Project currently are the following. -Note that this list can change over time. - -* None are currently defined. Please don't let this stop you from reporting - issues to the LLVM Security Response Group that you believe are security-sensitive. - -The parts of the LLVM Project which are currently treated as non-security -sensitive are the following. Note that this list can change over time. - -* Language front-ends, such as clang, for which a malicious input file can cause - undesirable behavior. For example, a maliciously crafted C or Rust source file - can cause arbitrary code to execute in LLVM. These parts of LLVM haven't been - hardened, and compiling untrusted code usually also includes running utilities - such as `make` which can more readily perform malicious things. - +making these parts of the code securable, and maintain these security properties +over time. In all cases the LLVM Security Response Group +`should be consulted `__, since they'll be +responding to security issues filed against these parts of the codebase. + +The security-sensitive parts of the LLVM Project currently are the following: + +* Code generation: most miscompilations are not security sensitive. However, a + miscompilation where there are clear indications that it can result in the + produced binary becoming significantly easier to exploit could be considered + security sensitive, and should be reported to the security response group. +* Run-time libraries: only parts of the run-time libraries are considered + security-sensitive. The parts that are not considered security-sensitive are + documented below. + +The following parts of the LLVM Project are currently treated as non-security +sensitive: + +* LLVM's language frontends, analyzers, optimizers, and code generators for + which a malicious input can cause undesirable behavior. For example, a + maliciously crafted C, Rust or bitcode input file can cause arbitrary code to + execute in LLVM. These parts of LLVM haven't been hardened, and handling + untrusted code usually also includes running utilities such as make which can + more readily perform malicious things. For example, vulnerabilities in clang, + clangd, or the LLVM optimizer in a JIT caused by untrusted inputs are not + security-sensitive. +* The following parts of the run-time libraries are explicitly not considered + security-sensitive: + + * parts of the run-time libraries that are not meant to be included in + production binaries. For example, most sanitizers are not considered + security-sensitive as they are meant to be used during development only, not + in production. + * for libc and libc++: if a user calls library functionality in an undefined + or otherwise incorrect way, this will most likely not be considered a + security issue, unless the libc/libc++ documentation explicitly promises to + harden or catch that specific undefined behaviour or incorrect usage. + * unwinding and exception handling: the implementations are not hardened + against malformed or malicious unwind or exception handling data. This is + not considered security sensitive. + +Note that both the explicit security-sensitive and explicit non-security +sensitive lists can change over time. If you're not sure whether an issue is +in-scope for this security process or not, err towards assuming that it is. The +Security Response Group might agree or disagree and will explain its rationale +in the report, as well as update this document through the above process. .. 
_CVE process: https://cve.mitre.org .. _report a vulnerability: https://github.com/llvm/llvm-security-repo/security/advisories/new diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index 3e16fe42b7d11..19f199fb462e7 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -47,6 +47,7 @@ intermediate LLVM representation. InstCombineContributorGuide InstrProfileFormat InstrRefDebugInfo + KeyInstructionsDebugInfo LinkTimeOptimization LoopTerminology MarkdownQuickstartTemplate @@ -193,6 +194,10 @@ Optimizations This is a migration guide describing how to move from debug info using intrinsics such as dbg.value to using the non-instruction DbgRecord object. +:doc:`KeyInstructionsDebugInfo` + This document explains how the debug info feature Key Instructions is + implemented in LLVM. + :doc:`InstrProfileFormat` This document explains two binary formats of instrumentation-based profiles. diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index b1009f8b49992..1a2331c1a0322 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -218,12 +218,12 @@ template class EquivalenceClasses { /// insert - Insert a new value into the union/find set, ignoring the request /// if the value already exists. const ECValue &insert(const ElemTy &Data) { - auto I = TheMapping.insert({Data, nullptr}); - if (!I.second) - return *I.first->second; + auto [I, Inserted] = TheMapping.try_emplace(Data); + if (!Inserted) + return *I->second; auto *ECV = new (ECValueAllocator) ECValue(Data); - I.first->second = ECV; + I->second = ECV; Members.push_back(ECV); return *ECV; } diff --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h index b3c4a414ed6b4..c089a070d4b57 100644 --- a/llvm/include/llvm/ADT/StringTable.h +++ b/llvm/include/llvm/ADT/StringTable.h @@ -85,14 +85,18 @@ class StringTable { "Last byte must be a null byte."); } - // Get a string from the table starting with the provided offset. The returned - // `StringRef` is in fact null terminated, and so can be converted safely to a - // C-string if necessary for a system API. - constexpr StringRef operator[](Offset O) const { + // Returns the raw C string from the table starting with the provided offset. + // The returned string is null terminated. + constexpr const char *getCString(Offset O) const { assert(O.value() < Table.size() && "Out of bounds offset!"); return Table.data() + O.value(); } + // Get a string from the table starting with the provided offset. The returned + // `StringRef` is in fact null terminated, and so can be converted safely to a + // C-string if necessary for a system API. + constexpr StringRef operator[](Offset O) const { return getCString(O); } + /// Returns the byte size of the table. 
constexpr size_t size() const { return Table.size(); } diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 151fb0a5e8ac6..d87457cac7642 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -162,15 +162,6 @@ class Vocabulary { static constexpr unsigned MaxOperandKinds = static_cast(OperandKind::MaxOperandKind); - /// Helper function to get vocabulary key for a given OperandKind - static StringRef getVocabKeyForOperandKind(OperandKind Kind); - - /// Helper function to classify an operand into OperandKind - static OperandKind getOperandKind(const Value *Op); - - /// Helper function to get vocabulary key for a given TypeID - static StringRef getVocabKeyForTypeID(Type::TypeID TypeID); - public: Vocabulary() = default; Vocabulary(VocabVector &&Vocab); @@ -179,6 +170,27 @@ class Vocabulary { unsigned getDimension() const; size_t size() const; + static size_t expectedSize() { + return MaxOpcodes + MaxTypeIDs + MaxOperandKinds; + } + + /// Helper function to get vocabulary key for a given Opcode + static StringRef getVocabKeyForOpcode(unsigned Opcode); + + /// Helper function to get vocabulary key for a given TypeID + static StringRef getVocabKeyForTypeID(Type::TypeID TypeID); + + /// Helper function to get vocabulary key for a given OperandKind + static StringRef getVocabKeyForOperandKind(OperandKind Kind); + + /// Helper function to classify an operand into OperandKind + static OperandKind getOperandKind(const Value *Op); + + /// Helpers to return the IDs of a given Opcode, TypeID, or OperandKind + static unsigned getNumericID(unsigned Opcode); + static unsigned getNumericID(Type::TypeID TypeID); + static unsigned getNumericID(const Value *Op); + /// Accessors to get the embedding for a given entity. const ir2vec::Embedding &operator[](unsigned Opcode) const; const ir2vec::Embedding &operator[](Type::TypeID TypeId) const; diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index af1e0d7251a4f..9a2773c06bae6 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -24,6 +24,7 @@ namespace llvm { class TargetLibraryInfo; +class IntrinsicInst; /// The Vector Function Database. /// @@ -188,6 +189,10 @@ LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID); /// Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics. LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID); +/// Given a deinterleaveN intrinsic, return the (narrow) vector type of each +/// factor. +LLVM_ABI VectorType *getDeinterleavedVectorType(IntrinsicInst *DI); + /// Given a vector and an element number, see if the scalar value is /// already around as a register, for example if it were inserted then extracted /// from the vector. diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index ebb257ab33821..6bf2e177b5d40 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1162,6 +1162,7 @@ enum : unsigned { // Android's experimental support for SHT_RELR sections. // https://android.googlesource.com/platform/bionic/+/b7feec74547f84559a1467aca02708ff61346d2a/libc/include/elf.h#512 SHT_ANDROID_RELR = 0x6fffff00, // Relocation entries; only offsets. + SHT_GNU_SFRAME = 0x6ffffff4, // GNU SFrame stack trace format. SHT_GNU_ATTRIBUTES = 0x6ffffff5, // Object attributes. SHT_GNU_HASH = 0x6ffffff6, // GNU-style hash table. 
 SHT_GNU_verdef = 0x6ffffffd,      // GNU version definitions.
@@ -1546,6 +1547,7 @@ enum {
   PT_GNU_STACK = 0x6474e551,    // Indicates stack executability.
   PT_GNU_RELRO = 0x6474e552,    // Read-only after relocation.
   PT_GNU_PROPERTY = 0x6474e553, // .note.gnu.property notes sections.
+  PT_GNU_SFRAME = 0x6474e554,   // GNU SFrame stack trace format.
 
   PT_OPENBSD_MUTABLE = 0x65a3dbe5,   // Like bss, but not immutable.
   PT_OPENBSD_RANDOMIZE = 0x65a3dbe6, // Fill with random data.
@@ -1968,7 +1970,6 @@ enum {
   GNU_ABI_TAG_FREEBSD = 3,
   GNU_ABI_TAG_NETBSD = 4,
   GNU_ABI_TAG_SYLLABLE = 5,
-  GNU_ABI_TAG_NACL = 6,
 };
 
 constexpr const char *ELF_NOTE_GNU = "GNU";
diff --git a/llvm/include/llvm/BinaryFormat/MinidumpConstants.def b/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
index 722a70ff67a9d..a04bb4c25f44e 100644
--- a/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
+++ b/llvm/include/llvm/BinaryFormat/MinidumpConstants.def
@@ -118,7 +118,6 @@ HANDLE_MDMP_PLATFORM(0x8201, Linux)    // Linux
 HANDLE_MDMP_PLATFORM(0x8202, Solaris)  // Solaris
 HANDLE_MDMP_PLATFORM(0x8203, Android)  // Android
 HANDLE_MDMP_PLATFORM(0x8204, PS3)      // PS3
-HANDLE_MDMP_PLATFORM(0x8205, NaCl)     // Native Client (NaCl)
 HANDLE_MDMP_PLATFORM(0x8206, OpenHOS)  // OpenHarmony OS
 
 HANDLE_MDMP_PROTECT(0x01, NoAccess, PAGE_NO_ACCESS)
diff --git a/llvm/include/llvm/BinaryFormat/SFrame.h b/llvm/include/llvm/BinaryFormat/SFrame.h
new file mode 100644
index 0000000000000..16d3b16c6c2d3
--- /dev/null
+++ b/llvm/include/llvm/BinaryFormat/SFrame.h
@@ -0,0 +1,165 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains data-structure definitions and constants to support
+/// unwinding based on .sframe sections. This only supports SFRAME_VERSION_2
+/// as described at https://sourceware.org/binutils/docs/sframe-spec.html
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINARYFORMAT_SFRAME_H
+#define LLVM_BINARYFORMAT_SFRAME_H
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Endian.h"
+
+namespace llvm::sframe {
+
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
+constexpr uint16_t Magic = 0xdee2;
+
+enum class Version : uint8_t {
+  V1 = 1,
+  V2 = 2,
+};
+
+enum class Flags : uint8_t {
+  FDESorted = 0x01,
+  FramePointer = 0x02,
+  FDEFuncStartPCRel = 0x04,
+  V2AllFlags = FDESorted | FramePointer | FDEFuncStartPCRel,
+  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/0xff),
+};
+
+enum class ABI : uint8_t {
+  AArch64EndianBig = 1,
+  AArch64EndianLittle = 2,
+  AMD64EndianLittle = 3,
+};
+
+/// SFrame FRE Types. Bits 0-3 of FuncDescEntry.Info.
+enum class FREType : uint8_t {
+  Addr1 = 0,
+  Addr2 = 1,
+  Addr4 = 2,
+};
+
+/// SFrame FDE Types. Bit 4 of FuncDescEntry.Info.
+enum class FDEType : uint8_t {
+  PCInc = 0,
+  PCMask = 1,
+};
+
+/// Specifies the key used for signing return addresses. Bit 5 of
+/// FuncDescEntry.Info.
+enum class AArch64PAuthKey : uint8_t {
+  A = 0,
+  B = 1,
+};
+
+/// Size of stack offsets. Bits 5-6 of FREInfo.Info.
+enum class FREOffset : uint8_t {
+  B1 = 0,
+  B2 = 1,
+  B4 = 2,
+};
+
+/// Stack frame base register. Bit 0 of FREInfo.Info.
+enum class BaseReg : uint8_t {
+  FP = 0,
+  SP = 1,
+};
+
+namespace detail {
+template <typename T, endianness E>
+using packed =
+    support::detail::packed_endian_specific_integral<T, E, support::unaligned>;
+}
+
+template <endianness E> struct Preamble {
+  detail::packed<uint16_t, E> Magic;
+  detail::packed<enum Version, E> Version;
+  detail::packed<enum Flags, E> Flags;
+};
+
+template <endianness E> struct Header {
+  struct Preamble<E> Preamble;
+  detail::packed<ABI, E> ABIArch;
+  detail::packed<int8_t, E> CFAFixedFPOffset;
+  detail::packed<int8_t, E> CFAFixedRAOffset;
+  detail::packed<uint8_t, E> AuxHdrLen;
+  detail::packed<uint32_t, E> NumFDEs;
+  detail::packed<uint32_t, E> NumFREs;
+  detail::packed<uint32_t, E> FRELen;
+  detail::packed<uint32_t, E> FDEOff;
+  detail::packed<uint32_t, E> FREOff;
+};
+
+template <endianness E> struct FuncDescEntry {
+  detail::packed<int32_t, E> StartAddress;
+  detail::packed<uint32_t, E> Size;
+  detail::packed<uint32_t, E> StartFREOff;
+  detail::packed<uint32_t, E> NumFREs;
+  detail::packed<uint8_t, E> Info;
+  detail::packed<uint8_t, E> RepSize;
+  detail::packed<uint16_t, E> Padding2;
+
+  uint8_t getPAuthKey() const { return (Info >> 5) & 1; }
+  FDEType getFDEType() const { return static_cast<FDEType>((Info >> 4) & 1); }
+  FREType getFREType() const { return static_cast<FREType>(Info & 0xf); }
+  void setPAuthKey(uint8_t P) { setFuncInfo(P, getFDEType(), getFREType()); }
+  void setFDEType(FDEType D) { setFuncInfo(getPAuthKey(), D, getFREType()); }
+  void setFREType(FREType R) { setFuncInfo(getPAuthKey(), getFDEType(), R); }
+  void setFuncInfo(uint8_t PAuthKey, FDEType FDE, FREType FRE) {
+    Info = ((PAuthKey & 1) << 5) | ((static_cast<uint8_t>(FDE) & 1) << 4) |
+           (static_cast<uint8_t>(FRE) & 0xf);
+  }
+};
+
+template <endianness E> struct FREInfo {
+  detail::packed<uint8_t, E> Info;
+
+  bool isReturnAddressSigned() const { return Info >> 7; }
+  FREOffset getOffsetSize() const {
+    return static_cast<FREOffset>((Info >> 5) & 3);
+  }
+  uint8_t getOffsetCount() const { return (Info >> 1) & 0xf; }
+  BaseReg getBaseRegister() const { return static_cast<BaseReg>(Info & 1); }
+  void setReturnAddressSigned(bool RA) {
+    setFREInfo(RA, getOffsetSize(), getOffsetCount(), getBaseRegister());
+  }
+  void setOffsetSize(FREOffset Sz) {
+    setFREInfo(isReturnAddressSigned(), Sz, getOffsetCount(),
+               getBaseRegister());
+  }
+  void setOffsetCount(uint8_t N) {
+    setFREInfo(isReturnAddressSigned(), getOffsetSize(), N, getBaseRegister());
+  }
+  void setBaseRegister(BaseReg Reg) {
+    setFREInfo(isReturnAddressSigned(), getOffsetSize(), getOffsetCount(), Reg);
+  }
+  void setFREInfo(bool RA, FREOffset Sz, uint8_t N, BaseReg Reg) {
+    Info = ((RA & 1) << 7) | ((static_cast<uint8_t>(Sz) & 3) << 5) |
+           ((N & 0xf) << 1) | (static_cast<uint8_t>(Reg) & 1);
+  }
+};
+
+template <typename T, endianness E> struct FrameRowEntry {
+  detail::packed<T, E> StartAddress;
+  FREInfo<E> Info;
+};
+
+template <endianness E> using FrameRowEntryAddr1 = FrameRowEntry<uint8_t, E>;
+template <endianness E> using FrameRowEntryAddr2 = FrameRowEntry<uint16_t, E>;
+template <endianness E> using FrameRowEntryAddr4 = FrameRowEntry<uint32_t, E>;
+
+} // namespace llvm::sframe
+
+#endif // LLVM_BINARYFORMAT_SFRAME_H
diff --git a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
index feb05de20b457..3a7eda68cdc27 100644
--- a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
+++ b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
@@ -35,8 +35,7 @@ using CreateCmpXchgInstFun = function_ref {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  // TODO: Handle other cost kinds.
- if (CostKind != TTI::TCK_RecipThroughput) + if (getTLI()->getValueType(DL, ValTy, true) == MVT::Other) return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, Op1Info, Op2Info, I); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 31f1197b9723b..da829046cc421 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -700,18 +700,19 @@ class CombinerHelper { /// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant, /// return an expression that implements it by multiplying by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". - MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const; + MachineInstr *buildUDivOrURemUsingMul(MachineInstr &MI) const; /// Combine G_UDIV or G_UREM by constant into a multiply by magic constant. - bool matchUDivorURemByConst(MachineInstr &MI) const; - void applyUDivorURemByConst(MachineInstr &MI) const; - - /// Given an G_SDIV \p MI expressing a signed divide by constant, return an - /// expression that implements it by multiplying by a magic number. - /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". - MachineInstr *buildSDivUsingMul(MachineInstr &MI) const; - /// Combine G_SDIV by constant into a multiply by magic constant. - bool matchSDivByConst(MachineInstr &MI) const; - void applySDivByConst(MachineInstr &MI) const; + bool matchUDivOrURemByConst(MachineInstr &MI) const; + void applyUDivOrURemByConst(MachineInstr &MI) const; + + /// Given an G_SDIV \p MI or G_SREM \p MI expressing a signed divide by + /// constant, return an expression that implements it by multiplying by a + /// magic number. Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's + /// Guide". + MachineInstr *buildSDivOrSRemUsingMul(MachineInstr &MI) const; + /// Combine G_SDIV or G_SREM by constant into a multiply by magic constant. + bool matchSDivOrSRemByConst(MachineInstr &MI) const; + void applySDivOrSRemByConst(MachineInstr &MI) const; /// Given an G_SDIV \p MI expressing a signed divided by a pow2 constant, /// return expressions that implements it by shifting. 
diff --git a/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h b/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h
index 98a60c987bbe3..1d954cf60c68c 100644
--- a/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h
+++ b/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h
@@ -46,6 +46,11 @@ class MachineFunctionAnalysis
   LLVM_ABI Result run(Function &F, FunctionAnalysisManager &FAM);
 };
 
+class FreeMachineFunctionPass : public PassInfoMixin<FreeMachineFunctionPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
 } // namespace llvm
 
 #endif // LLVM_CODEGEN_MachineFunctionAnalysis
diff --git a/llvm/include/llvm/CodeGen/PostRAMachineSink.h b/llvm/include/llvm/CodeGen/PostRAMachineSink.h
index 2f1f79a0480df..4ab088f17cafc 100644
--- a/llvm/include/llvm/CodeGen/PostRAMachineSink.h
+++ b/llvm/include/llvm/CodeGen/PostRAMachineSink.h
@@ -20,8 +20,7 @@ class PostRAMachineSinkingPass
                         MachineFunctionAnalysisManager &MFAM);
 
   MachineFunctionProperties getRequiredProperties() const {
-    return MachineFunctionProperties().set(
-        MachineFunctionProperties::Property::NoVRegs);
+    return MachineFunctionProperties().setNoVRegs();
   }
 };
 
diff --git a/llvm/include/llvm/CodeGen/ProcessImplicitDefs.h b/llvm/include/llvm/CodeGen/ProcessImplicitDefs.h
new file mode 100644
index 0000000000000..12a97d3efec66
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/ProcessImplicitDefs.h
@@ -0,0 +1,27 @@
+//===- llvm/CodeGen/ProcessImplicitDefs.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PROCESSIMPLICITDEFS_H
+#define LLVM_CODEGEN_PROCESSIMPLICITDEFS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class ProcessImplicitDefsPass : public PassInfoMixin<ProcessImplicitDefsPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+  MachineFunctionProperties getRequiredProperties() const {
+    return MachineFunctionProperties().setIsSSA();
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_PROCESSIMPLICITDEFS_H
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a248eb7444b20..1a548a536f088 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3201,11 +3201,15 @@ class LLVM_ABI TargetLoweringBase {
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
-  /// \p LI is the vector load instruction.
+  /// \p Load is the vector load instruction. Can be either a plain load
+  /// instruction or a vp.load intrinsic.
+  /// \p Mask is a per-segment (i.e. number of lanes equal to that of one
+  /// component being interwoven) mask. Can be nullptr, in which case the
+  /// result is unconditional.
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
-  virtual bool lowerInterleavedLoad(LoadInst *LI,
+  virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                                     ArrayRef<ShuffleVectorInst *> Shuffles,
                                     ArrayRef<unsigned> Indices,
                                     unsigned Factor) const {
@@ -3223,17 +3227,6 @@
     return false;
   }
 
-  /// Lower an interleaved load to target specific intrinsics. Return
-  /// true on success.
-  ///
-  /// \p Load is a vp.load instruction.
-  /// \p Mask is a mask value
-  /// \p DeinterleaveRes is a list of deinterleaved results.
-  virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                                      ArrayRef<Value *> DeinterleaveRes) const {
-    return false;
-  }
-
   /// Lower an interleaved store to target specific intrinsics. Return
   /// true on success.
   ///
@@ -3249,11 +3242,11 @@
   /// Return true on success. Currently only supports
   /// llvm.vector.deinterleave{2,3,5,7}
   ///
-  /// \p LI is the accompanying load instruction.
-  /// \p DeinterleaveValues contains the deinterleaved values.
-  virtual bool
-  lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
-                                   ArrayRef<Value *> DeinterleaveValues) const {
+  /// \p Load is the accompanying load instruction. Can be either a plain load
+  /// instruction or a vp.load intrinsic.
+  /// \p DI represents the deinterleaveN intrinsic.
+  virtual bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
+                                                IntrinsicInst *DI) const {
     return false;
   }
 
@@ -3261,10 +3254,14 @@
   /// Return true on success. Currently only supports
   /// llvm.vector.interleave{2,3,5,7}
   ///
-  /// \p SI is the accompanying store instruction
+  /// \p Store is the accompanying store instruction. Can be either a plain
+  /// store or a vp.store intrinsic.
+  /// \p Mask is a per-segment (i.e. number of lanes equal to that of one
+  /// component being interwoven) mask. Can be nullptr, in which case the
+  /// result is unconditional.
   /// \p InterleaveValues contains the interleaved values.
   virtual bool
-  lowerInterleaveIntrinsicToStore(StoreInst *SI,
+  lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask,
                                   ArrayRef<Value *> InterleaveValues) const {
     return false;
   }
diff --git a/llvm/include/llvm/CodeGen/VirtRegMap.h b/llvm/include/llvm/CodeGen/VirtRegMap.h
index b5f962e0fb821..405ae3905ebe5 100644
--- a/llvm/include/llvm/CodeGen/VirtRegMap.h
+++ b/llvm/include/llvm/CodeGen/VirtRegMap.h
@@ -253,10 +253,8 @@ class VirtRegRewriterPass : public PassInfoMixin<VirtRegRewriterPass> {
                 function_ref) const;
 
   MachineFunctionProperties getSetProperties() const {
-    if (ClearVirtRegs) {
-      return MachineFunctionProperties().set(
-          MachineFunctionProperties::Property::NoVRegs);
-    }
+    if (ClearVirtRegs)
+      return MachineFunctionProperties().setNoVRegs();
     return {};
   }
 };
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index cdc80c88b7425..611bfe3f8aced 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -795,25 +795,9 @@ bool ConstructDecompositionT::applyClause(
   // assigned to which leaf constructs.
// [5.2:340:33] - auto canMakePrivateCopy = [](llvm::omp::Clause id) { - switch (id) { - // Clauses with "privatization" property: - case llvm::omp::Clause::OMPC_firstprivate: - case llvm::omp::Clause::OMPC_in_reduction: - case llvm::omp::Clause::OMPC_lastprivate: - case llvm::omp::Clause::OMPC_linear: - case llvm::omp::Clause::OMPC_private: - case llvm::omp::Clause::OMPC_reduction: - case llvm::omp::Clause::OMPC_task_reduction: - return true; - default: - return false; - } - }; - bool applied = applyIf(node, [&](const auto &leaf) { return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) { - return canMakePrivateCopy(n->id); + return llvm::omp::isPrivatizingClause(n->id); }); }); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.h b/llvm/include/llvm/Frontend/OpenMP/OMP.h index 35dafc6d246f0..9d0a55432e1ae 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.h @@ -48,6 +48,26 @@ static constexpr inline bool canHaveIterator(Clause C) { } } +// Can clause C create a private copy of a variable. +static constexpr inline bool isPrivatizingClause(Clause C) { + switch (C) { + case OMPC_detach: + case OMPC_firstprivate: + // TODO case OMPC_induction: + case OMPC_in_reduction: + case OMPC_is_device_ptr: + case OMPC_lastprivate: + case OMPC_linear: + case OMPC_private: + case OMPC_reduction: + case OMPC_task_reduction: + case OMPC_use_device_ptr: + return true; + default: + return false; + } +} + static constexpr unsigned FallbackVersion = 52; LLVM_ABI ArrayRef getOpenMPVersions(); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 19a4058b64382..206ad4a4ef85f 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -34,6 +34,9 @@ class CanonicalLoopInfo; struct TargetRegionEntryInfo; class OffloadEntriesInfoManager; class OpenMPIRBuilder; +class Loop; +class LoopAnalysis; +class LoopInfo; /// Move the instruction after an InsertPoint to the beginning of another /// BasicBlock. @@ -1114,6 +1117,7 @@ class OpenMPIRBuilder { /// \param NamePrefix Optional name prefix for if.then if.else blocks. void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond, ValueMap &VMap, + LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L, const Twine &NamePrefix = ""); public: diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 77cee875f16e7..f8241a3cdf160 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -115,8 +115,7 @@ class DebugInfoFinder { LLVM_ABI void processVariable(DILocalVariable *DVI); /// Process debug info location. LLVM_ABI void processLocation(const Module &M, const DILocation *Loc); - /// Process a DbgRecord (e.g, treat a DbgVariableRecord like a - /// DbgVariableIntrinsic). + /// Process a DbgRecord. LLVM_ABI void processDbgRecord(const Module &M, const DbgRecord &DR); /// Process subprogram. 
@@ -290,8 +289,6 @@ struct VarRecord { DILocalVariable *Var; DILocation *DL; - VarRecord(DbgVariableIntrinsic *DVI) - : Var(DVI->getVariable()), DL(getDebugValueLoc(DVI)) {} VarRecord(DbgVariableRecord *DVR) : Var(DVR->getVariable()), DL(getDebugValueLoc(DVR)) {} VarRecord(DILocalVariable *Var, DILocation *DL) : Var(Var), DL(DL) {} diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 9345f95015301..f1f0c18949c35 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -66,7 +66,6 @@ namespace dwarf { enum Tag : uint16_t; } -class DbgVariableIntrinsic; class DbgVariableRecord; LLVM_ABI extern cl::opt EnableFSDiscriminator; @@ -4613,7 +4612,6 @@ class DebugVariable { LLVM_ABI static const FragmentInfo DefaultFragment; public: - LLVM_ABI DebugVariable(const DbgVariableIntrinsic *DII); LLVM_ABI DebugVariable(const DbgVariableRecord *DVR); DebugVariable(const DILocalVariable *Var, @@ -4681,7 +4679,6 @@ template <> struct DenseMapInfo { /// information). class DebugVariableAggregate : public DebugVariable { public: - LLVM_ABI DebugVariableAggregate(const DbgVariableIntrinsic *DVI); DebugVariableAggregate(const DebugVariable &V) : DebugVariable(V.getVariable(), std::nullopt, V.getInlinedAt()) {} }; diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index ef382a9168f24..5d25804a684ac 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -898,28 +898,6 @@ class Instruction : public User, /// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst. LLVM_ABI bool isDebugOrPseudoInst() const LLVM_READONLY; - /// Return a pointer to the next non-debug instruction in the same basic - /// block as 'this', or nullptr if no such instruction exists. Skip any pseudo - /// operations if \c SkipPseudoOp is true. - LLVM_ABI const Instruction * - getNextNonDebugInstruction(bool SkipPseudoOp = false) const; - Instruction *getNextNonDebugInstruction(bool SkipPseudoOp = false) { - return const_cast( - static_cast(this)->getNextNonDebugInstruction( - SkipPseudoOp)); - } - - /// Return a pointer to the previous non-debug instruction in the same basic - /// block as 'this', or nullptr if no such instruction exists. Skip any pseudo - /// operations if \c SkipPseudoOp is true. - LLVM_ABI const Instruction * - getPrevNonDebugInstruction(bool SkipPseudoOp = false) const; - Instruction *getPrevNonDebugInstruction(bool SkipPseudoOp = false) { - return const_cast( - static_cast(this)->getPrevNonDebugInstruction( - SkipPseudoOp)); - } - /// Create a copy of 'this' instruction that is identical in all ways except /// the following: /// * The instruction has no parent diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index d8f8bdeb01d43..0318427bf8a1c 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -54,7 +54,7 @@ class IntrinsicInst : public CallInst { /// Return the intrinsic ID of this intrinsic. 
Intrinsic::ID getIntrinsicID() const { - return getCalledFunction()->getIntrinsicID(); + return cast(getCalledOperand())->getIntrinsicID(); } bool isAssociative() const { @@ -131,9 +131,8 @@ class IntrinsicInst : public CallInst { /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const CallInst *I) { - if (const Function *CF = I->getCalledFunction()) - return CF->isIntrinsic(); - return false; + auto *F = dyn_cast_or_null(I->getCalledOperand()); + return F && F->isIntrinsic(); } static bool classof(const Value *V) { return isa(V) && classof(cast(V)); diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 962693003349e..d8fda0e2bcfa3 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1936,7 +1936,9 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; -class AMDGPUStructPtrBufferLoadLDS : Intrinsic < +class AMDGPUStructPtrBufferLoadLDS : + ClangBuiltin<"__builtin_amdgcn_struct_ptr_buffer_load_lds">, + Intrinsic < [], [AMDGPUBufferRsrcTy, // rsrc(SGPR) LLVMQualPointerType<3>, // LDS base offset @@ -2919,6 +2921,20 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" // the form: D = A * B + C. // A is sparse matrix, half the size of B, and is expanded using sparsity index. +class AMDGPUSWmmacIntrinsicIdxReuse : + Intrinsic< + [CD], // %D + [ + A, // %A + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>] +>; + class AMDGPUSWmmacIntrinsicIdx : Intrinsic< [CD], // %D @@ -3602,6 +3618,160 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< [IntrNoMem, IntrSpeculatable] >; +// WMMA intrinsics. 
+class AMDGPUWmmaIntrinsicModsAB : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg + AB, // %A + llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg + LLVMMatchType<1>, // %B + LLVMMatchType<0>, // %C + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +class AMDGPUWmmaIntrinsicModsC : + Intrinsic< + [CD], // %D + [ + AB, // %A + LLVMMatchType<1>, // %B + llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs) + LLVMMatchType<0>, // %C + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +class AMDGPUWmmaIntrinsicF4ModsC : + Intrinsic< + [CD], // %D + [ + A, // %A + B, // %B + llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs) + LLVMMatchType<0>, // %C + ], + [IntrNoMem, IntrConvergent, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +class AMDGPUWmmaIntrinsicModsAll : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg + AB, // %A + llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg + LLVMMatchType<1>, // %B + llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) + LLVMMatchType<0>, // %C + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +class AMDGPUWmmaIntrinsicModsAllReuse : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg + AB, // %A + llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg + LLVMMatchType<1>, // %B + llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) + LLVMMatchType<0>, // %C + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +// D and C are of different types. 
+class AMDGPUWmmaIntrinsicModsAllDiff : + Intrinsic< + [DstTy], // %D + [ + llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg + AB, // %A + llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg + LLVMMatchType<1>, // %B + llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) + C, // %C + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +defset list AMDGPUWMMAIntrinsicsGFX1250 = { +def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_bf16f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllDiff; +def int_amdgcn_wmma_f32_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x64_bf8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x64_bf8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x64_bf8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x64_bf8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB; +def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC; +} + +class AMDGPUSWmmacIntrinsicABIdx : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 - none, 1 - neg + A, // %A + llvm_i1_ty, // %B_mod: 0 - none, 1 - neg + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>, ImmArg>] +>; + +defset list AMDGPUSWMMACIntrinsicsGFX1250 = { +def int_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_bf16_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_bf16f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_f32_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f32_16x16x128_fp8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f32_16x16x128_bf8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f32_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : 
AMDGPUSWmmacIntrinsicIdxReuse;
+def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse;
+def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdx;
+}
+
+
 class AMDGPUTensorLoadStore:
   Intrinsic<
     [],
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 35c9cd63581d6..b5f0cdf479c08 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -90,7 +90,12 @@ let TargetPrefix = "spv" in {
   def int_spv_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
   def int_spv_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
   def int_spv_reflect : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>;
-  def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
+  def int_spv_refract
+      : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
+                              [llvm_anyfloat_ty, LLVMMatchType<0>,
+                               llvm_anyfloat_ty],
+                              [IntrNoMem]>;
+def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
   def int_spv_saturate : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
   def int_spv_smoothstep : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
   def int_spv_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [LLVMMatchType<0>, llvm_anyfloat_ty], [IntrNoMem]>;
diff --git a/llvm/include/llvm/IR/OptBisect.h b/llvm/include/llvm/IR/OptBisect.h
index ea3c1defeb100..d813ae933d65e 100644
--- a/llvm/include/llvm/IR/OptBisect.h
+++ b/llvm/include/llvm/IR/OptBisect.h
@@ -15,6 +15,7 @@
 #define LLVM_IR_OPTBISECT_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Compiler.h"
 #include <limits>
 
@@ -82,8 +83,38 @@ class LLVM_ABI OptBisect : public OptPassGate {
   mutable int LastBisectNum = 0;
 };
 
-/// Singleton instance of the OptBisect class, so multiple pass managers don't
-/// need to coordinate their uses of OptBisect.
+/// This class implements a mechanism to disable passes and individual
+/// optimizations at compile time based on a command line option
+/// (-opt-disable) in order to study how single transformations, or
+/// combinations thereof, affect the IR.
+class LLVM_ABI OptDisable : public OptPassGate {
+public:
+  /// Checks the pass name to determine if the specified pass should run.
+  ///
+  /// It returns true if the pass should run, i.e. if its name was not
+  /// provided via the command line.
+  /// If -opt-disable-enable-verbosity is given, the method prints the
+  /// name of the pass, and whether or not the pass will be executed.
+  ///
+  /// Most passes should not call this routine directly. Instead, it is called
+  /// through helper routines provided by the base classes of the pass. For
+  /// instance, function passes should call FunctionPass::skipFunction().
+  bool shouldRunPass(StringRef PassName,
+                     StringRef IRDescription) const override;
+
+  /// Parses the command line argument to extract the names of the passes
+  /// to be disabled. Multiple pass names can be provided with comma separation.
+  void setDisabled(StringRef Pass);
+
+  /// isEnabled() should return true before calling shouldRunPass().
+ bool isEnabled() const override { return !DisabledPasses.empty(); } + +private: + StringSet<> DisabledPasses = {}; +}; + +/// Singleton instance of the OptPassGate class, so multiple pass managers don't +/// need to coordinate their uses of OptBisect and OptDisable. LLVM_ABI OptPassGate &getGlobalPassGate(); } // end namespace llvm diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index 4f44ae56eb3c7..ea8226c6e17ba 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -491,6 +491,22 @@ template class AnalysisManager { /// invalidate them, unless they are preserved by the PreservedAnalyses set. void invalidate(IRUnitT &IR, const PreservedAnalyses &PA); + /// Directly clear a cached analysis for an IR unit. + /// + /// Using invalidate() over this is preferred unless you are really + /// sure you want to *only* clear this analysis without asking if it is + /// invalid. + template void clearAnalysis(IRUnitT &IR) { + AnalysisResultListT &ResultsList = AnalysisResultLists[&IR]; + AnalysisKey *ID = AnalysisT::ID(); + + auto I = + llvm::find_if(ResultsList, [&ID](auto &E) { return E.first == ID; }); + assert(I != ResultsList.end() && "Analysis must be available"); + ResultsList.erase(I); + AnalysisResults.erase({ID, &IR}); + } + private: /// Look up a registered analysis pass. PassConceptT &lookUpPass(AnalysisKey *ID) { diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index ed9b83d5d4361..50e50a91389e2 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -2633,7 +2633,7 @@ struct IntrinsicID_match { template bool match(OpTy *V) const { if (const auto *CI = dyn_cast(V)) - if (const auto *F = CI->getCalledFunction()) + if (const auto *F = dyn_cast_or_null(CI->getCalledOperand())) return F->getIntrinsicID() == ID; return false; } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 162b0fa7d2e9b..8058c8a4c5510 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/StringTable.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/InstrTypes.h" #include "llvm/Support/AtomicOrdering.h" @@ -77,12 +78,16 @@ struct RuntimeLibcallsInfo { /// Get the libcall routine name for the specified libcall. // FIXME: This should be removed. Only LibcallImpl should have a name. const char *getLibcallName(RTLIB::Libcall Call) const { - return LibCallImplNames[LibcallImpls[Call]]; + return getLibcallImplName(LibcallImpls[Call]); } /// Get the libcall routine name for the specified libcall implementation. + // FIXME: Change to return StringRef static const char *getLibcallImplName(RTLIB::LibcallImpl CallImpl) { - return LibCallImplNames[CallImpl]; + if (CallImpl == RTLIB::Unsupported) + return nullptr; + return RuntimeLibcallImplNameTable[RuntimeLibcallNameOffsetTable[CallImpl]] + .data(); } /// Return the lowering's selection of implementation call for \p Call @@ -144,7 +149,9 @@ struct RuntimeLibcallsInfo { /// Names of concrete implementations of runtime calls. e.g. 
__ashlsi3 for /// SHL_I32 - LLVM_ABI static const char *const LibCallImplNames[RTLIB::NumLibcallImpls]; + LLVM_ABI static const char RuntimeLibcallImplNameTableStorage[]; + LLVM_ABI static const StringTable RuntimeLibcallImplNameTable; + LLVM_ABI static const uint16_t RuntimeLibcallNameOffsetTable[]; /// Map from a concrete LibcallImpl implementation to its RTLIB::Libcall kind. LLVM_ABI static const RTLIB::Libcall ImplToLibcall[RTLIB::NumLibcallImpls]; @@ -174,6 +181,10 @@ struct RuntimeLibcallsInfo { (TT.isAndroid() && !TT.isAndroidVersionLT(9)); } + static bool hasSinCos_f32_f64(const Triple &TT) { + return hasSinCos(TT) || TT.isPS(); + } + LLVM_ABI void initDefaultLibCallImpls(); /// Generated by tablegen. diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index ff343f30f0325..f0297cd1a0873 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -17,12 +17,23 @@ class DuplicateLibcallImplWithPrefix /// Libcall Predicates def isOSDarwin : RuntimeLibcallPredicate<"TT.isOSDarwin()">; +def isOSOpenBSD : RuntimeLibcallPredicate<"TT.isOSOpenBSD()">; def isOSWindows : RuntimeLibcallPredicate<"TT.isOSWindows()">; +def isNotOSWindows : RuntimeLibcallPredicate<"!TT.isOSWindows()">; +def isNotOSMSVCRT : RuntimeLibcallPredicate<"!TT.isOSMSVCRT()">; +def isPS : RuntimeLibcallPredicate<"TT.isPS()">; +def isNotOSWindowsOrIsCygwinMinGW + : RuntimeLibcallPredicate<"!TT.isOSWindows() || TT.isOSCygMing()">; + +def isGNUEnvironment : RuntimeLibcallPredicate<"TT.isGNUEnvironment()">; def darwinHasSinCosStret : RuntimeLibcallPredicate<"darwinHasSinCosStret(TT)">; def darwinHasExp10 : RuntimeLibcallPredicate<"darwinHasExp10(TT)">; def hasSinCos : RuntimeLibcallPredicate<"hasSinCos(TT)">; +// FIXME: Way to combine predicates +def hasSinCos_f32_f64 : RuntimeLibcallPredicate<"hasSinCos_f32_f64(TT)">; + //-------------------------------------------------------------------- // Declare all kinds of used libcalls //-------------------------------------------------------------------- @@ -275,6 +286,9 @@ foreach MemSize = [1, 2, 4, 8, 16] in { // Exception handling def UNWIND_RESUME : RuntimeLibcall; +def UNWIND_REGISTER : RuntimeLibcall; +def UNWIND_UNREGISTER : RuntimeLibcall; +def UNWIND_CALL_PERSONALITY : RuntimeLibcall; def CXA_END_CLEANUP : RuntimeLibcall; // Note: there are two sets of atomics libcalls; see @@ -346,6 +360,10 @@ defset list LibCalls__OutOfLineAtomic = { // Stack Protector Fail def STACKPROTECTOR_CHECK_FAIL : RuntimeLibcall; +def STACK_SMASH_HANDLER : RuntimeLibcall; + +// Safe stack +def SAFESTACK_POINTER_ADDRESS : RuntimeLibcall; // Deoptimization def DEOPTIMIZE : RuntimeLibcall; @@ -514,11 +532,13 @@ def __divxf3 : RuntimeLibcallImpl; def __divtf3 : RuntimeLibcallImpl; def __gcc_qdiv : RuntimeLibcallImpl; +defset list PowiLibcallImpls = { def __powisf2 : RuntimeLibcallImpl; def __powidf2 : RuntimeLibcallImpl; def __powixf2 : RuntimeLibcallImpl; def __powitf2_f128 : RuntimeLibcallImpl; def __powitf2_ppc128 : RuntimeLibcallImpl; +} // Conversion def __extendbfsf2 : RuntimeLibcallImpl; @@ -685,6 +705,9 @@ foreach lc = LibCalls__atomic in { // Stack Protector Fail def __stack_chk_fail : RuntimeLibcallImpl; +// Safe stack. 
+def __safestack_pointer_address : RuntimeLibcallImpl; + // Deoptimization def __llvm_deoptimize : RuntimeLibcallImpl; @@ -973,6 +996,12 @@ defm sincos : LibmLongDoubleLibCall; def bzero : RuntimeLibcallImpl; def __bzero : RuntimeLibcallImpl; def _Unwind_SjLj_Resume : RuntimeLibcallImpl; +def _Unwind_SjLj_Register : RuntimeLibcallImpl; +def _Unwind_SjLj_Unregister : RuntimeLibcallImpl; +def _Unwind_CallPersonality : RuntimeLibcallImpl; + +// Used on OpenBSD +def __stack_smash_handler : RuntimeLibcallImpl; def __riscv_flush_icache : RuntimeLibcallImpl; @@ -1084,6 +1113,29 @@ defvar LibmHasSinCosF80 = LibcallImpls<(add sincos_f80), hasSinCos>; defvar LibmHasSinCosF128 = LibcallImpls<(add sincos_f128), hasSinCos>; defvar LibmHasSinCosPPCF128 = LibcallImpls<(add sincos_ppcf128), hasSinCos>; +defvar WindowsMathRemovals = [ + ldexpf, ldexp_f80, ldexp_f128, ldexp_ppcf128, + frexpf, frexp_f80, frexp_f128, frexp_ppcf128 +]; + +defvar MostPowI = !listremove(PowiLibcallImpls, [__powitf2_f128, __powitf2_ppc128]); +defvar WindowsExclusions = !listconcat(WindowsMathRemovals, MostPowI); + +// Targets which support windows should start with these as a base and +// add in calls for other OSes +defvar Win32DefaultLibcallImpls = !listremove(DefaultLibcallImpls32, WindowsExclusions); +defvar Win64DefaultLibcallImpls = !listremove(DefaultLibcallImpls64, WindowsExclusions); + +defvar LibmHasFrexpF32 = LibcallImpls<(add frexpf), isNotOSWindowsOrIsCygwinMinGW>; +defvar LibmHasLdexpF32 = LibcallImpls<(add ldexpf), isNotOSWindowsOrIsCygwinMinGW>; + +defvar LibmHasFrexpF80 = LibcallImpls<(add frexp_f80), isNotOSWindowsOrIsCygwinMinGW>; +defvar LibmHasLdexpF80 = LibcallImpls<(add ldexp_f80), isNotOSWindowsOrIsCygwinMinGW>; + +defvar LibmHasFrexpF128 = LibcallImpls<(add frexp_f128), isNotOSWindowsOrIsCygwinMinGW>; +defvar LibmHasLdexpF128 = LibcallImpls<(add ldexp_f128), isNotOSWindowsOrIsCygwinMinGW>; + + //===----------------------------------------------------------------------===// // Objective-C Runtime Libcalls //===----------------------------------------------------------------------===// @@ -1151,7 +1203,10 @@ def isAArch64_ILP64 : RuntimeLibcallPredicate<"TT.isAArch64(64)">; def AArch64SystemLibrary : SystemRuntimeLibrary< isAArch64_ExceptArm64EC, - (add DefaultRuntimeLibcallImpls, + (add Win64DefaultLibcallImpls, + LibcallImpls<(add __powisf2, __powidf2), isNotOSMSVCRT>, + LibmHasFrexpF32, LibmHasLdexpF32, + LibmHasFrexpF128, LibmHasLdexpF128, AArch64LibcallImpls, LibcallImpls<(add Int128RTLibcalls), isAArch64_ILP64>, LibcallImpls<(add bzero), isOSDarwin>, @@ -1161,7 +1216,7 @@ def AArch64SystemLibrary : SystemRuntimeLibrary< // Prepend a # to every name defset list WinArm64ECDefaultRuntimeLibcallImpls = { - foreach libcall = DefaultLibcallImpls64 in { + foreach libcall = Win64DefaultLibcallImpls in { def arm64ec_#libcall : DuplicateLibcallImplWithPrefix; } @@ -1418,11 +1473,15 @@ def isARMOrThumb : RuntimeLibcallPredicate<"TT.isARM() || TT.isThumb()">; def ARMSystemLibrary : SystemRuntimeLibrary, + LibmHasFrexpF32, LibmHasLdexpF32, + LibmHasFrexpF128, LibmHasLdexpF128, WindowARMDivRemCalls, WindowARMFPIntCasts, AEABIDivRemCalls, DarwinSinCosStret, DarwinExp10, + LibmHasSinCosF32, LibmHasSinCosF64, LibmHasSinCosF128, // Use divmod compiler-rt calls for iOS 5.0 and later. 
LibcallImpls<(add __divmodsi4, __udivmodsi4), @@ -1481,6 +1540,15 @@ def AVRSystemLibrary // Trigonometric rtlib functions avr_sin, avr_cos)>; +//===----------------------------------------------------------------------===// +// DXIL Runtime Libcalls +//===----------------------------------------------------------------------===// + +def isDXIL : RuntimeLibcallPredicate<"TT.isDXIL()">; + +// No calls +def DXILSystemLibrary : SystemRuntimeLibrary; + //===----------------------------------------------------------------------===// // Hexagon Runtime Libcalls //===----------------------------------------------------------------------===// @@ -1916,6 +1984,8 @@ def PPCSystemLibrary __extendkftf2, __trunctfkf2, DefaultRuntimeLibcallImpls_ppcf128, LibmF128Libcalls, AIX32Calls, AIX64Calls, + LibmHasSinCosF32, LibmHasSinCosF64, LibmHasSinCosF128, + LibmHasSinCosPPCF128, AvailableIf, LibcallImpls<(add Int128RTLibcalls), isPPC64>)>; @@ -2000,16 +2070,27 @@ def SPARCSystemLibrary >; //===----------------------------------------------------------------------===// -// Windows Runtime Libcalls +// SPIRV Runtime Libcalls //===----------------------------------------------------------------------===// -// TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment() +def isSPIRV : RuntimeLibcallPredicate<"TT.isSPIRV()">; +// No calls. FIXME: Adding memcpy/memset is a hack to skip +// PreISelIntrinsicLowering in favor of SPIRVPrepareFunctions; +// probably should remove that and just use the default one. +def SPIRVSystemLibrary : SystemRuntimeLibrary; + +//===----------------------------------------------------------------------===// +// Windows Runtime Libcalls +//===----------------------------------------------------------------------===// + +defset list WindowsDivRemMulLibcalls = { def _alldiv : RuntimeLibcallImpl; def _aulldiv : RuntimeLibcallImpl; def _allrem : RuntimeLibcallImpl; def _aullrem : RuntimeLibcallImpl; def _allmul : RuntimeLibcallImpl; +} //===----------------------------------------------------------------------===// // X86 Runtime Libcalls @@ -2017,14 +2098,48 @@ def _allmul : RuntimeLibcallImpl; def isX86_32 : RuntimeLibcallPredicate<"TT.getArch() == Triple::x86">; def isX86_64 : RuntimeLibcallPredicate<"TT.getArch() == Triple::x86_64">; +def isX86 : RuntimeLibcallPredicate<"TT.isX86()">; + +// Some Darwin targets have an optimized __bzero/bzero function. +def darwinHas__bzero : RuntimeLibcallPredicate<"TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6)">; + +// FIXME: This has ldexpl/frexpl plus uses f128 for long double. +def hasFrexplLdexplF128 + : RuntimeLibcallPredicate<[{(!TT.isOSWindows() || TT.isOSCygMing()) && !TT.isGNUEnvironment()}]>; + +// Use the f128 variants of math functions on x86 +defvar X86_F128_Libcalls = LibcallImpls<(add LibmF128Libcalls, LibmF128FiniteLibcalls), isGNUEnvironment>; + +defvar SinCosF32F64Libcalls = LibcallImpls<(add sincosf, sincos), hasSinCos_f32_f64>; + +defvar X86CommonLibcalls = + (add DarwinSinCosStret, DarwinExp10, + X86_F128_Libcalls, + LibmHasSinCosF80, // FIXME: Depends on long double + SinCosF32F64Libcalls, + LibcallImpls<(add __bzero), darwinHas__bzero>, + LibmHasFrexpF32, LibmHasLdexpF32, + LibmHasFrexpF80, LibmHasLdexpF80, + LibcallImpls<(add frexp_f128, ldexp_f128), hasFrexplLdexplF128>, + DefaultRuntimeLibcallImpls_f80, + // FIXME: MSVCRT doesn't have powi. The f128 case is added as a + // hack for one test relying on it.
+ __powitf2_f128, + LibcallImpls<(add MostPowI), isNotOSMSVCRT> +); + +defvar Windows32DivRemMulCalls = + LibcallsWithCC<(add WindowsDivRemMulLibcalls), X86_STDCALL, + RuntimeLibcallPredicate<"TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()">>; def X86_32SystemLibrary : SystemRuntimeLibrary; + (add X86CommonLibcalls, + Windows32DivRemMulCalls, Win32DefaultLibcallImpls)>; def X86_64SystemLibrary : SystemRuntimeLibrary; + (add X86CommonLibcalls, Win64DefaultLibcallImpls, Int128RTLibcalls)>; //===----------------------------------------------------------------------===// // XCore Runtime Libcalls diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index c847716647825..2e231cfba2443 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -257,7 +257,7 @@ LLVM_ABI void initializePostRASchedulerLegacyPass(PassRegistry &); LLVM_ABI void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &); LLVM_ABI void initializePrintFunctionPassWrapperPass(PassRegistry &); LLVM_ABI void initializePrintModulePassWrapperPass(PassRegistry &); -LLVM_ABI void initializeProcessImplicitDefsPass(PassRegistry &); +LLVM_ABI void initializeProcessImplicitDefsLegacyPass(PassRegistry &); LLVM_ABI void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &); LLVM_ABI void initializePromoteLegacyPassPass(PassRegistry &); LLVM_ABI void initializeRABasicPass(PassRegistry &); diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h index 6b81bdba25e67..0322cbe6cbe8d 100644 --- a/llvm/include/llvm/MC/MCAsmBackend.h +++ b/llvm/include/llvm/MC/MCAsmBackend.h @@ -19,11 +19,8 @@ namespace llvm { class MCAlignFragment; -class MCDwarfCallFrameFragment; -class MCDwarfLineAddrFragment; class MCFragment; class MCLEBFragment; -class MCRelaxableFragment; class MCSymbol; class MCAssembler; class MCContext; @@ -157,8 +154,9 @@ class LLVM_ABI MCAsmBackend { /// Target specific predicate for whether a given fixup requires the /// associated instruction to be relaxed. - virtual bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, - uint64_t, bool Resolved) const; + virtual bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &, + const MCValue &, uint64_t, + bool Resolved) const; /// Simple predicate for targets where !Resolved implies requiring relaxation virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, @@ -179,18 +177,16 @@ class LLVM_ABI MCAsmBackend { } // Defined by linker relaxation targets. - virtual bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, - bool &WasRelaxed) const { + virtual bool relaxDwarfLineAddr(MCFragment &, bool &WasRelaxed) const { return false; } - virtual bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, - bool &WasRelaxed) const { + virtual bool relaxDwarfCFA(MCFragment &, bool &WasRelaxed) const { return false; } // Defined by linker relaxation targets to possibly emit LEB128 relocations // and set Value at the relocated location. - virtual std::pair relaxLEB128(MCLEBFragment &LF, + virtual std::pair relaxLEB128(MCFragment &, int64_t &Value) const { return std::make_pair(false, false); } @@ -228,8 +224,7 @@ class LLVM_ABI MCAsmBackend { bool isDarwinCanonicalPersonality(const MCSymbol *Sym) const; - // Return STI for fragments of type MCRelaxableFragment and MCDataFragment - // with hasInstructions() == true. + // Return STI for fragments with hasInstructions() == true. 
static const MCSubtargetInfo *getSubtargetInfo(const MCFragment &F); }; diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 1015992cedf29..ade9ee6fa56e0 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -34,13 +34,10 @@ namespace llvm { class MCBoundaryAlignFragment; class MCCVDefRangeFragment; class MCCVInlineLineTableFragment; -class MCDwarfCallFrameFragment; -class MCDwarfLineAddrFragment; -class MCEncodedFragment; +class MCFragment; class MCFixup; class MCLEBFragment; class MCPseudoProbeAddrFragment; -class MCRelaxableFragment; class MCSymbolRefExpr; class raw_ostream; class MCAsmBackend; @@ -72,6 +69,13 @@ class MCAssembler { SmallVector Symbols; + struct RelocDirective { + const MCExpr &Offset; + const MCExpr *Expr; + uint32_t Kind; + }; + SmallVector relocDirectives; + mutable SmallVector, 0> PendingErrors; MCDwarfLineTableParams LTParams; @@ -85,11 +89,6 @@ class MCAssembler { // refactoring too. mutable SmallPtrSet ThumbFuncs; - /// The bundle alignment size currently set in the assembler. - /// - /// By default it's 0, which means bundling is disabled. - unsigned BundleAlignSize = 0; - /// Evaluate a fixup to a relocatable expression and the value which should be /// placed into the fixup. /// @@ -107,7 +106,7 @@ class MCAssembler { /// Check whether a fixup can be satisfied, or whether it needs to be relaxed /// (increased in size, in order to hold its value correctly). - bool fixupNeedsRelaxation(const MCRelaxableFragment &, const MCFixup &) const; + bool fixupNeedsRelaxation(const MCFragment &, const MCFixup &) const; void layoutSection(MCSection &Sec); /// Perform one layout iteration and return the index of the first stable @@ -116,11 +115,11 @@ class MCAssembler { /// Perform relaxation on a single fragment. bool relaxFragment(MCFragment &F); - bool relaxInstruction(MCRelaxableFragment &IF); - bool relaxLEB(MCLEBFragment &IF); + bool relaxInstruction(MCFragment &F); + bool relaxLEB(MCFragment &F); bool relaxBoundaryAlign(MCBoundaryAlignFragment &BF); - bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF); - bool relaxDwarfCallFrameFragment(MCDwarfCallFrameFragment &DF); + bool relaxDwarfLineAddr(MCFragment &F); + bool relaxDwarfCallFrameFragment(MCFragment &F); bool relaxCVInlineLineTable(MCCVInlineLineTableFragment &DF); bool relaxCVDefRange(MCCVDefRangeFragment &DF); bool relaxFill(MCFillFragment &F); @@ -143,8 +142,6 @@ class MCAssembler { /// Compute the effective fragment size. LLVM_ABI uint64_t computeFragmentSize(const MCFragment &F) const; - LLVM_ABI void layoutBundle(MCFragment *Prev, MCFragment *F) const; - // Get the offset of the given fragment inside its containing section. 
uint64_t getFragmentOffset(const MCFragment &F) const { return F.Offset; } @@ -203,16 +200,6 @@ class MCAssembler { bool getRelaxAll() const { return RelaxAll; } void setRelaxAll(bool Value) { RelaxAll = Value; } - bool isBundlingEnabled() const { return BundleAlignSize != 0; } - - unsigned getBundleAlignSize() const { return BundleAlignSize; } - - void setBundleAlignSize(unsigned Size) { - assert((Size == 0 || !(Size & (Size - 1))) && - "Expect a power-of-two bundle align size"); - BundleAlignSize = Size; - } - const_iterator begin() const { return Sections.begin(); } const_iterator end() const { return Sections.end(); } @@ -225,12 +212,7 @@ class MCAssembler { LLVM_ABI bool registerSection(MCSection &Section); LLVM_ABI bool registerSymbol(const MCSymbol &Symbol); - - /// Write the necessary bundle padding to \p OS. - /// Expects a fragment \p F containing instructions and its size \p FSize. - LLVM_ABI void writeFragmentPadding(raw_ostream &OS, - const MCEncodedFragment &F, - uint64_t FSize) const; + void addRelocDirective(RelocDirective RD); LLVM_ABI void reportError(SMLoc L, const Twine &Msg) const; // Record pending errors during layout iteration, as they may go away once the diff --git a/llvm/include/llvm/MC/MCCodeView.h b/llvm/include/llvm/MC/MCCodeView.h index 88f84a2462841..9cde44c71baff 100644 --- a/llvm/include/llvm/MC/MCCodeView.h +++ b/llvm/include/llvm/MC/MCCodeView.h @@ -26,7 +26,6 @@ namespace llvm { class MCAssembler; class MCCVDefRangeFragment; class MCCVInlineLineTableFragment; -class MCDataFragment; class MCFragment; class MCSection; class MCSymbol; @@ -231,7 +230,7 @@ class CodeViewContext { StringMap StringTable; /// The fragment that ultimately holds our strings. - MCDataFragment *StrTabFragment = nullptr; + MCFragment *StrTabFragment = nullptr; SmallVector StrTab = {'\0'}; /// Get a string table offset. 
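For context, a rough sketch of how the new deferred .reloc plumbing fits together, consistent with the RelocDirective and addRelocDirective declarations above; this is an approximation of the intended flow, not the literal implementation from this patch (which lives in the .cpp files):

  // The object streamer now just records the directive; the offset expression
  // is evaluated later, during layout, rather than at parse time.
  void MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name,
                                            const MCExpr *Expr, SMLoc Loc) {
    std::optional<MCFixupKind> Kind =
        getAssembler().getBackend().getFixupKind(Name);
    if (!Kind)
      return getContext().reportError(Loc, "unknown relocation name");
    getAssembler().addRelocDirective({Offset, Expr, uint32_t(*Kind)});
  }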
diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h index 5a8ec17dae1cc..c137f6184a9a7 100644 --- a/llvm/include/llvm/MC/MCContext.h +++ b/llvm/include/llvm/MC/MCContext.h @@ -47,7 +47,6 @@ namespace llvm { class CodeViewContext; class MCAsmInfo; -class MCDataFragment; class MCInst; class MCLabel; class MCObjectFileInfo; @@ -334,7 +333,7 @@ class MCContext { void reportCommon(SMLoc Loc, std::function); - MCDataFragment *allocInitialFragment(MCSection &Sec); + MCFragment *allocInitialFragment(MCSection &Sec); MCSymbolTableEntry &getSymbolTableEntry(StringRef Name); diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h index fa0b5cbde71d9..144f6bc3bd91c 100644 --- a/llvm/include/llvm/MC/MCELFStreamer.h +++ b/llvm/include/llvm/MC/MCELFStreamer.h @@ -17,7 +17,6 @@ namespace llvm { class ELFObjectWriter; class MCContext; -class MCDataFragment; class MCFragment; class MCObjectWriter; class MCSection; @@ -51,7 +50,7 @@ class MCELFStreamer : public MCObjectStreamer { void initSections(bool NoExecStack, const MCSubtargetInfo &STI) override; void changeSection(MCSection *Section, uint32_t Subsection = 0) override; void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override; - void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCDataFragment &F, + void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCFragment &F, uint64_t Offset) override; void emitWeakReference(MCSymbol *Alias, const MCSymbol *Target) override; bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; @@ -65,13 +64,8 @@ class MCELFStreamer : public MCObjectStreamer { void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, Align ByteAlignment) override; - void emitValueImpl(const MCExpr *Value, unsigned Size, - SMLoc Loc = SMLoc()) override; - void emitIdent(StringRef IdentString) override; - void emitValueToAlignment(Align, int64_t, uint8_t, unsigned) override; - void emitCGProfileEntry(const MCSymbolRefExpr *From, const MCSymbolRefExpr *To, uint64_t Count) override; @@ -79,10 +73,6 @@ class MCELFStreamer : public MCObjectStreamer { // target-specific code. 
void finishImpl() final; - void emitBundleAlignMode(Align Alignment) override; - void emitBundleLock(bool AlignToEnd) override; - void emitBundleUnlock() override; - /// ELF object attributes section emission support struct AttributeItem { // This structure holds all attributes, accounting for their string / @@ -151,10 +141,8 @@ class MCELFStreamer : public MCObjectStreamer { } private: - bool isBundleLocked() const; - void emitInstToData(const MCInst &Inst, const MCSubtargetInfo &) override; - - void finalizeCGProfileEntry(const MCSymbolRefExpr *&S, uint64_t Offset); + void finalizeCGProfileEntry(const MCSymbolRefExpr *Sym, uint64_t Offset, + const MCSymbolRefExpr *&S); void finalizeCGProfile(); bool SeenIdent = false; diff --git a/llvm/include/llvm/MC/MCFixup.h b/llvm/include/llvm/MC/MCFixup.h index 4002a862e4d79..aaf75102fb9ed 100644 --- a/llvm/include/llvm/MC/MCFixup.h +++ b/llvm/include/llvm/MC/MCFixup.h @@ -94,7 +94,6 @@ class MCFixup { } MCFixupKind getKind() const { return Kind; } - unsigned getTargetKind() const { return Kind; } uint32_t getOffset() const { return Offset; } void setOffset(uint32_t Value) { Offset = Value; } diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index 2b42507d66920..a55fd4a14675f 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -40,14 +40,6 @@ class MCObjectStreamer : public MCStreamer { std::unique_ptr Assembler; bool EmitEHFrame; bool EmitDebugFrame; - struct PendingMCFixup { - const MCSymbol *Sym; - MCFixup Fixup; - MCDataFragment *DF; - PendingMCFixup(const MCSymbol *McSym, MCDataFragment *F, MCFixup McFixup) - : Sym(McSym), Fixup(McFixup), DF(F) {} - }; - SmallVector PendingFixups; struct PendingAssignment { MCSymbol *Symbol; @@ -59,11 +51,10 @@ class MCObjectStreamer : public MCStreamer { DenseMap> pendingAssignments; - virtual void emitInstToData(const MCInst &Inst, const MCSubtargetInfo &); + void emitInstToData(const MCInst &Inst, const MCSubtargetInfo &); void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override; void emitInstructionImpl(const MCInst &Inst, const MCSubtargetInfo &STI); - void resolvePendingFixups(); protected: MCObjectStreamer(MCContext &Context, std::unique_ptr TAB, @@ -92,10 +83,10 @@ class MCObjectStreamer : public MCStreamer { } /// Get a data fragment to write into, creating a new one if the current - /// fragment is not a data fragment. + /// fragment is not FT_Data. /// Optionally a \p STI can be passed in so that a new fragment is created /// if the Subtarget differs from the current fragment. - MCDataFragment *getOrCreateDataFragment(const MCSubtargetInfo* STI = nullptr); + MCFragment *getOrCreateDataFragment(const MCSubtargetInfo *STI = nullptr); protected: bool changeSectionImpl(MCSection *Section, uint32_t Subsection); @@ -109,7 +100,7 @@ class MCObjectStreamer : public MCStreamer { /// @{ void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override; - virtual void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCDataFragment &F, + virtual void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCFragment &F, uint64_t Offset); void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; void emitConditionalAssignment(MCSymbol *Symbol, @@ -127,9 +118,6 @@ class MCObjectStreamer : public MCStreamer { /// can change its size during relaxation. 
void emitInstToFragment(const MCInst &Inst, const MCSubtargetInfo &); - void emitBundleAlignMode(Align Alignment) override; - void emitBundleLock(bool AlignToEnd) override; - void emitBundleUnlock() override; void emitBytes(StringRef Data) override; void emitValueToAlignment(Align Alignment, int64_t Fill = 0, uint8_t FillLen = 1, @@ -165,9 +153,8 @@ class MCObjectStreamer : public MCStreamer { void emitCVStringTableDirective() override; void emitCVFileChecksumsDirective() override; void emitCVFileChecksumOffsetDirective(unsigned FileNo) override; - std::optional> - emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, - SMLoc Loc, const MCSubtargetInfo &STI) override; + void emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc = {}) override; using MCStreamer::emitFill; void emitFill(const MCExpr &NumBytes, uint64_t FillValue, SMLoc Loc = SMLoc()) override; diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h index 64b13972bfca1..296fdd8af0d14 100644 --- a/llvm/include/llvm/MC/MCSection.h +++ b/llvm/include/llvm/MC/MCSection.h @@ -46,8 +46,6 @@ class LLVM_ABI MCSection { friend MCAssembler; friend MCObjectStreamer; friend class MCFragment; - friend class MCEncodedFragment; - friend class MCRelaxableFragment; static constexpr unsigned NonUniqueID = ~0U; enum SectionVariant { @@ -61,13 +59,6 @@ class LLVM_ABI MCSection { SV_DXContainer, }; - /// Express the state of bundle locked groups while emitting code. - enum BundleLockStateType { - NotBundleLocked, - BundleLocked, - BundleLockedAlignToEnd - }; - struct iterator { MCFragment *F = nullptr; iterator() = default; @@ -94,16 +85,6 @@ class LLVM_ABI MCSection { /// The section index in the assemblers section list. unsigned Ordinal = 0; - /// Keeping track of bundle-locked state. - BundleLockStateType BundleLockState = NotBundleLocked; - - /// Current nesting depth of bundle_lock directives. - unsigned BundleLockNestingDepth = 0; - - /// We've seen a bundle_lock directive but not its first instruction - /// yet. - bool BundleGroupBeforeFirstInst : 1; - /// Whether this section has had instructions emitted into it. bool HasInstructions : 1; @@ -169,17 +150,6 @@ class LLVM_ABI MCSection { unsigned getOrdinal() const { return Ordinal; } void setOrdinal(unsigned Value) { Ordinal = Value; } - BundleLockStateType getBundleLockState() const { return BundleLockState; } - void setBundleLockState(BundleLockStateType NewState); - bool isBundleLocked() const { return BundleLockState != NotBundleLocked; } - - bool isBundleGroupBeforeFirstInst() const { - return BundleGroupBeforeFirstInst; - } - void setBundleGroupBeforeFirstInst(bool IsFirst) { - BundleGroupBeforeFirstInst = IsFirst; - } - bool hasInstructions() const { return HasInstructions; } void setHasInstructions(bool Value) { HasInstructions = Value; } @@ -255,65 +225,59 @@ class MCFragment { FragmentType Kind; protected: - /// Used by subclasses for better packing. + bool LinkerRelaxable : 1; + + /// Used by certain fragment types for better packing. 
/// - /// MCEncodedFragment + /// FT_Data, FT_Relaxable bool HasInstructions : 1; - bool AlignToBundleEnd : 1; - /// MCDataFragment - bool LinkerRelaxable : 1; - /// MCRelaxableFragment: x86-specific + /// FT_Relaxable, x86-specific bool AllowAutoPadding : 1; - LLVM_ABI MCFragment(FragmentType Kind, bool HasInstructions); - -public: - MCFragment() = delete; - MCFragment(const MCFragment &) = delete; - MCFragment &operator=(const MCFragment &) = delete; - - MCFragment *getNext() const { return Next; } - - FragmentType getKind() const { return Kind; } - - MCSection *getParent() const { return Parent; } - void setParent(MCSection *Value) { Parent = Value; } - - LLVM_ABI const MCSymbol *getAtom() const; - - unsigned getLayoutOrder() const { return LayoutOrder; } - void setLayoutOrder(unsigned Value) { LayoutOrder = Value; } - - /// Does this fragment have instructions emitted into it? By default - /// this is false, but specific fragment types may set it to true. - bool hasInstructions() const { return HasInstructions; } - - bool isLinkerRelaxable() const { return LinkerRelaxable; } - void setLinkerRelaxable() { LinkerRelaxable = true; } - - LLVM_ABI void dump() const; -}; - -/// Interface implemented by fragments that contain encoded instructions and/or -/// data. -class MCEncodedFragment : public MCFragment { - uint8_t BundlePadding = 0; uint32_t ContentStart = 0; uint32_t ContentEnd = 0; uint32_t FixupStart = 0; uint32_t FixupEnd = 0; -protected: - MCEncodedFragment(MCFragment::FragmentType FType, bool HasInstructions) - : MCFragment(FType, HasInstructions) {} + uint32_t VarContentStart = 0; + uint32_t VarContentEnd = 0; + uint32_t VarFixupStart = 0; + uint32_t VarFixupEnd = 0; - /// The MCSubtargetInfo in effect when the instruction was encoded. - /// It must be non-null for instructions. const MCSubtargetInfo *STI = nullptr; + // Optional variable-size tail used by various fragment types. + union Tail { + struct { + uint32_t Opcode; + uint32_t Flags; + uint32_t OperandStart; + uint32_t OperandSize; + } relax; + struct { + // True if this is a sleb128, false if uleb128. + bool IsSigned; + // The value this fragment should contain. + const MCExpr *Value; + } leb; + // Used by .debug_frame and .debug_line to encode an address difference. + struct { + // The address difference between two labels. + const MCExpr *AddrDelta; + // The value of the difference between the two line numbers between two + // .loc dwarf directives. + int64_t LineDelta; + } dwarf; + } u{}; + public: - static bool classof(const MCFragment *F) { - MCFragment::FragmentType Kind = F->getKind(); + LLVM_ABI MCFragment(FragmentType Kind = MCFragment::FT_Data, + bool HasInstructions = false); + MCFragment(const MCFragment &) = delete; + MCFragment &operator=(const MCFragment &) = delete; + + bool isEncoded() const { + MCFragment::FragmentType Kind = getKind(); switch (Kind) { default: return false; @@ -329,20 +293,23 @@ class MCEncodedFragment : public MCFragment { } } - /// Should this fragment be placed at the end of an aligned bundle? - bool alignToBundleEnd() const { return AlignToBundleEnd; } - void setAlignToBundleEnd(bool V) { AlignToBundleEnd = V; } + MCFragment *getNext() const { return Next; } - /// Get the padding size that must be inserted before this fragment. - /// Used for bundling. By default, no padding is inserted. - /// Note that padding size is restricted to 8 bits. This is an optimization - /// to reduce the amount of space used for each fragment. In practice, larger - /// padding should never be required. 
- uint8_t getBundlePadding() const { return BundlePadding; } + FragmentType getKind() const { return Kind; } - /// Set the padding size for this fragment. By default it's a no-op, - /// and only some fragments have a meaningful implementation. - void setBundlePadding(uint8_t N) { BundlePadding = N; } + MCSection *getParent() const { return Parent; } + void setParent(MCSection *Value) { Parent = Value; } + + LLVM_ABI const MCSymbol *getAtom() const; + + unsigned getLayoutOrder() const { return LayoutOrder; } + void setLayoutOrder(unsigned Value) { LayoutOrder = Value; } + + /// Does this fragment have instructions emitted into it? By default + /// this is false, but specific fragment types may set it to true. + bool hasInstructions() const { return HasInstructions; } + + LLVM_ABI void dump() const; /// Retrieve the MCSubTargetInfo in effect when the instruction was encoded. /// Guaranteed to be non-null if hasInstructions() == true @@ -355,6 +322,9 @@ class MCEncodedFragment : public MCFragment { this->STI = &STI; } + bool isLinkerRelaxable() const { return LinkerRelaxable; } + void setLinkerRelaxable() { LinkerRelaxable = true; } + bool getAllowAutoPadding() const { return AllowAutoPadding; } void setAllowAutoPadding(bool V) { AllowAutoPadding = V; } @@ -394,7 +364,24 @@ class MCEncodedFragment : public MCFragment { .slice(ContentStart, ContentEnd - ContentStart); } - // Fixup-related functions manage parent's storage using FixupStart and + void setVarContents(ArrayRef Contents); + void clearVarContents() { setVarContents({}); } + MutableArrayRef getVarContents() { + return MutableArrayRef(getParent()->ContentStorage) + .slice(VarContentStart, VarContentEnd - VarContentStart); + } + ArrayRef getVarContents() const { + return ArrayRef(getParent()->ContentStorage) + .slice(VarContentStart, VarContentEnd - VarContentStart); + } + + size_t getFixedSize() const { return ContentEnd - ContentStart; } + size_t getVarSize() const { return VarContentEnd - VarContentStart; } + size_t getSize() const { + return ContentEnd - ContentStart + (VarContentEnd - VarContentStart); + } + + //== Fixup-related functions manage parent's storage using FixupStart and // FixupSize. void clearFixups() { FixupEnd = FixupStart; } LLVM_ABI void addFixup(MCFixup Fixup); @@ -409,65 +396,96 @@ class MCEncodedFragment : public MCFragment { .slice(FixupStart, FixupEnd - FixupStart); } - size_t getSize() const { return ContentEnd - ContentStart; } -}; - -/// Fragment for data and encoded instructions. -/// -class MCDataFragment : public MCEncodedFragment { -public: - MCDataFragment() : MCEncodedFragment(FT_Data, false) {} - - static bool classof(const MCFragment *F) { - return F->getKind() == MCFragment::FT_Data; + // Source fixup offsets are relative to the variable part's start. + // Stored fixup offsets are relative to the fixed part's start. + void setVarFixups(ArrayRef Fixups); + void clearVarFixups() { setVarFixups({}); } + MutableArrayRef getVarFixups() { + return MutableArrayRef(getParent()->FixupStorage) + .slice(VarFixupStart, VarFixupEnd - VarFixupStart); } -}; - -/// A relaxable fragment holds on to its MCInst, since it may need to be -/// relaxed during the assembler layout and relaxation stage. 
-/// -class MCRelaxableFragment : public MCEncodedFragment { - uint32_t Opcode = 0; - uint32_t Flags = 0; - uint32_t OperandStart = 0; - uint32_t OperandSize = 0; - -public: - MCRelaxableFragment(const MCSubtargetInfo &STI) - : MCEncodedFragment(FT_Relaxable, true) { - this->STI = &STI; + ArrayRef getVarFixups() const { + return ArrayRef(getParent()->FixupStorage) + .slice(VarFixupStart, VarFixupEnd - VarFixupStart); } - unsigned getOpcode() const { return Opcode; } + //== FT_Relaxable functions + unsigned getOpcode() const { + assert(Kind == FT_Relaxable); + return u.relax.Opcode; + } ArrayRef getOperands() const { + assert(Kind == FT_Relaxable); return MutableArrayRef(getParent()->MCOperandStorage) - .slice(OperandStart, OperandSize); + .slice(u.relax.OperandStart, u.relax.OperandSize); } MCInst getInst() const { + assert(Kind == FT_Relaxable); MCInst Inst; - Inst.setOpcode(Opcode); - Inst.setFlags(Flags); + Inst.setOpcode(u.relax.Opcode); + Inst.setFlags(u.relax.Flags); Inst.setOperands(ArrayRef(getParent()->MCOperandStorage) - .slice(OperandStart, OperandSize)); + .slice(u.relax.OperandStart, u.relax.OperandSize)); return Inst; } void setInst(const MCInst &Inst) { - Opcode = Inst.getOpcode(); - Flags = Inst.getFlags(); + assert(Kind == FT_Relaxable); + u.relax.Opcode = Inst.getOpcode(); + u.relax.Flags = Inst.getFlags(); auto &S = getParent()->MCOperandStorage; - if (Inst.getNumOperands() > OperandSize) { - OperandStart = S.size(); + if (Inst.getNumOperands() > u.relax.OperandSize) { + u.relax.OperandStart = S.size(); S.resize_for_overwrite(S.size() + Inst.getNumOperands()); } - OperandSize = Inst.getNumOperands(); - llvm::copy(Inst, S.begin() + OperandStart); + u.relax.OperandSize = Inst.getNumOperands(); + llvm::copy(Inst, S.begin() + u.relax.OperandStart); } - static bool classof(const MCFragment *F) { - return F->getKind() == MCFragment::FT_Relaxable; + //== FT_LEB functions + const MCExpr &getLEBValue() const { + assert(Kind == FT_LEB); + return *u.leb.Value; + } + void setLEBValue(const MCExpr *Expr) { + assert(Kind == FT_LEB); + u.leb.Value = Expr; + } + bool isLEBSigned() const { + assert(Kind == FT_LEB); + return u.leb.IsSigned; + } + void setLEBSigned(bool S) { + assert(Kind == FT_LEB); + u.leb.IsSigned = S; + } + + //== FT_DwarfFrame functions + const MCExpr &getDwarfAddrDelta() const { + assert(Kind == FT_Dwarf || Kind == FT_DwarfFrame); + return *u.dwarf.AddrDelta; + } + void setDwarfAddrDelta(const MCExpr *E) { + assert(Kind == FT_Dwarf || Kind == FT_DwarfFrame); + u.dwarf.AddrDelta = E; + } + int64_t getDwarfLineDelta() const { + assert(Kind == FT_Dwarf); + return u.dwarf.LineDelta; + } + void setDwarfLineDelta(int64_t LineDelta) { + assert(Kind == FT_Dwarf); + u.dwarf.LineDelta = LineDelta; } }; +/// Interface implemented by fragments that contain encoded instructions and/or +/// data. +class MCEncodedFragment : public MCFragment { +protected: + MCEncodedFragment(MCFragment::FragmentType FType, bool HasInstructions) + : MCFragment(FType, HasInstructions) {} +}; + class MCAlignFragment : public MCFragment { /// Flag to indicate that (optimal) NOPs should be emitted instead /// of using the provided value. The exact interpretation of this flag is @@ -599,67 +617,6 @@ class MCOrgFragment : public MCFragment { } }; -class MCLEBFragment final : public MCEncodedFragment { - /// True if this is a sleb128, false if uleb128. - bool IsSigned; - - /// The value this fragment should contain. 
- const MCExpr *Value; - -public: - MCLEBFragment(const MCExpr &Value, bool IsSigned) - : MCEncodedFragment(FT_LEB, false), IsSigned(IsSigned), Value(&Value) {} - - const MCExpr &getValue() const { return *Value; } - void setValue(const MCExpr *Expr) { Value = Expr; } - - bool isSigned() const { return IsSigned; } - - static bool classof(const MCFragment *F) { - return F->getKind() == MCFragment::FT_LEB; - } -}; - -class MCDwarfLineAddrFragment : public MCEncodedFragment { - /// The value of the difference between the two line numbers - /// between two .loc dwarf directives. - int64_t LineDelta; - - /// The expression for the difference of the two symbols that - /// make up the address delta between two .loc dwarf directives. - const MCExpr *AddrDelta; - -public: - MCDwarfLineAddrFragment(int64_t LineDelta, const MCExpr &AddrDelta) - : MCEncodedFragment(FT_Dwarf, false), LineDelta(LineDelta), - AddrDelta(&AddrDelta) {} - - int64_t getLineDelta() const { return LineDelta; } - - const MCExpr &getAddrDelta() const { return *AddrDelta; } - - static bool classof(const MCFragment *F) { - return F->getKind() == MCFragment::FT_Dwarf; - } -}; - -class MCDwarfCallFrameFragment : public MCEncodedFragment { - /// The expression for the difference of the two symbols that - /// make up the address delta between two .cfi_* dwarf directives. - const MCExpr *AddrDelta; - -public: - MCDwarfCallFrameFragment(const MCExpr &AddrDelta) - : MCEncodedFragment(FT_DwarfFrame, false), AddrDelta(&AddrDelta) {} - - const MCExpr &getAddrDelta() const { return *AddrDelta; } - void setAddrDelta(const MCExpr *E) { AddrDelta = E; } - - static bool classof(const MCFragment *F) { - return F->getKind() == MCFragment::FT_DwarfFrame; - } -}; - /// Represents a symbol table index fragment. class MCSymbolIdFragment : public MCFragment { const MCSymbol *Sym; diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 4e052ecfe0e86..b3a9aabd6ece5 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -1048,13 +1048,9 @@ class LLVM_ABI MCStreamer { virtual void emitSyntaxDirective(); - /// Record a relocation described by the .reloc directive. Return std::nullopt - /// if succeeded. Otherwise, return a pair (Name is invalid, error message). - virtual std::optional> - emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, - SMLoc Loc, const MCSubtargetInfo &STI) { - return std::nullopt; - } + /// Record a relocation described by the .reloc directive. + virtual void emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc = {}) {} virtual void emitAddrsig() {} virtual void emitAddrsigSym(const MCSymbol *Sym) {} @@ -1068,19 +1064,6 @@ class LLVM_ABI MCStreamer { const MCPseudoProbeInlineStack &InlineStack, MCSymbol *FnSym); - /// Set the bundle alignment mode from now on in the section. - /// The value 1 means turn the bundle alignment off. - virtual void emitBundleAlignMode(Align Alignment); - - /// The following instructions are a bundle-locked group. - /// - /// \param AlignToEnd - If true, the bundle-locked group will be aligned to - /// the end of a bundle. - virtual void emitBundleLock(bool AlignToEnd); - - /// Ends a bundle-locked group. - virtual void emitBundleUnlock(); - /// If this file is backed by a assembly streamer, this dumps the /// specified string in the output .s file. This capability is indicated by /// the hasRawTextSupport() predicate. By default this aborts. 
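Since MCLEBFragment and the DWARF address-delta fragments are now plain MCFragments carrying their payload in a variable-size tail, relaxation can be written against MCFragment directly. A condensed sketch of an FT_LEB relaxation step using the accessors introduced above (this assumes the LEB expression folds to a constant by relaxation time; error handling is omitted):

  #include "llvm/ADT/SmallString.h"
  #include "llvm/Support/LEB128.h"
  #include "llvm/Support/raw_ostream.h"

  static bool relaxLEBSketch(const MCAssembler &Asm, MCFragment &F) {
    assert(F.getKind() == MCFragment::FT_LEB && "expected an FT_LEB fragment");
    uint64_t OldSize = F.getVarSize();
    int64_t Value = 0;
    F.getLEBValue().evaluateAsAbsolute(Value, Asm); // sketch: assume it folds
    SmallString<8> Data;
    raw_svector_ostream OS(Data);
    if (F.isLEBSigned())
      encodeSLEB128(Value, OS);
    else
      encodeULEB128(Value, OS);
    // The encoded bytes live in the fragment's variable-size tail.
    F.setVarContents(ArrayRef<char>(Data.begin(), Data.size()));
    return F.getVarSize() != OldSize; // another layout iteration if size changed
  }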
diff --git a/llvm/include/llvm/MC/MCWasmStreamer.h b/llvm/include/llvm/MC/MCWasmStreamer.h index 2598c261ea02a..e8a71975c5d62 100644 --- a/llvm/include/llvm/MC/MCWasmStreamer.h +++ b/llvm/include/llvm/MC/MCWasmStreamer.h @@ -42,7 +42,7 @@ class MCWasmStreamer : public MCObjectStreamer { void changeSection(MCSection *Section, uint32_t Subsection) override; void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override; - void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCDataFragment &F, + void emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, MCFragment &F, uint64_t Offset) override; bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size, diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h index 103686884e705..a3aa0d9c137a2 100644 --- a/llvm/include/llvm/Object/ELFObjectFile.h +++ b/llvm/include/llvm/Object/ELFObjectFile.h @@ -1312,7 +1312,7 @@ StringRef ELFObjectFile::getFileFormatName() const { case ELF::EM_PPC: return (IsLittleEndian ? "elf32-powerpcle" : "elf32-powerpc"); case ELF::EM_RISCV: - return "elf32-littleriscv"; + return (IsLittleEndian ? "elf32-littleriscv" : "elf32-bigriscv"); case ELF::EM_CSKY: return "elf32-csky"; case ELF::EM_SPARC: @@ -1338,7 +1338,7 @@ StringRef ELFObjectFile::getFileFormatName() const { case ELF::EM_PPC64: return (IsLittleEndian ? "elf64-powerpcle" : "elf64-powerpc"); case ELF::EM_RISCV: - return "elf64-littleriscv"; + return (IsLittleEndian ? "elf64-littleriscv" : "elf64-bigriscv"); case ELF::EM_S390: return "elf64-s390"; case ELF::EM_SPARCV9: @@ -1400,9 +1400,9 @@ template Triple::ArchType ELFObjectFile::getArch() const { case ELF::EM_RISCV: switch (EF.getHeader().e_ident[ELF::EI_CLASS]) { case ELF::ELFCLASS32: - return Triple::riscv32; + return IsLittleEndian ? Triple::riscv32 : Triple::riscv32be; case ELF::ELFCLASS64: - return Triple::riscv64; + return IsLittleEndian ? Triple::riscv64 : Triple::riscv64be; default: report_fatal_error("Invalid ELFCLASS!"); } diff --git a/llvm/include/llvm/Pass.h b/llvm/include/llvm/Pass.h index 2ecd47dd10bde..f3962c3556c95 100644 --- a/llvm/include/llvm/Pass.h +++ b/llvm/include/llvm/Pass.h @@ -114,6 +114,10 @@ class LLVM_ABI Pass { /// Registration templates, but can be overloaded directly. virtual StringRef getPassName() const; + /// Return a nice clean name for a pass + /// corresponding to that used to enable the pass in opt. + StringRef getPassArgument() const; + /// getPassID - Return the PassID number that corresponds to this pass. AnalysisID getPassID() const { return PassID; diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 910e25a048815..b0360f1903c0e 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -72,6 +72,7 @@ #include "llvm/CodeGen/PostRAMachineSink.h" #include "llvm/CodeGen/PostRASchedulerList.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" +#include "llvm/CodeGen/ProcessImplicitDefs.h" #include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/RegAllocGreedyPass.h" @@ -171,8 +172,12 @@ template class CodeGenPassBuilder { // Target should override TM.Options.EnableIPRA in their target-specific // LLVMTM ctor. See TargetMachine::setGlobalISel for example. - if (Opt.EnableIPRA) + if (Opt.EnableIPRA) { TM.Options.EnableIPRA = *Opt.EnableIPRA; + } else { + // If not explicitly specified, use target default. 
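+ // (An explicit -enable-ipra=0/1 on the command line still wins via the
+ // branch above; the |= only promotes the target's default when unset.)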
+ TM.Options.EnableIPRA |= TM.useIPRA(); + } if (Opt.EnableGlobalISelAbort) TM.Options.GlobalISelAbort = *Opt.EnableGlobalISelAbort; @@ -276,7 +281,7 @@ template class CodeGenPassBuilder { FunctionPassManager FPM; FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))); - FPM.addPass(InvalidateAnalysisPass()); + FPM.addPass(FreeMachineFunctionPass()); if (this->PB.AddInCGSCCOrder) { MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( createCGSCCToFunctionPassAdaptor(std::move(FPM)))); @@ -574,8 +579,10 @@ template class CodeGenPassBuilder { void insertPass(InsertedPassT &&Pass) const { AfterCallbacks.emplace_back( [&](StringRef Name, MachineFunctionPassManager &MFPM) mutable { - if (Name == TargetPassT::name()) + if (Name == TargetPassT::name() && + runBeforeAdding(InsertedPassT::name())) { MFPM.addPass(std::forward(Pass)); + } }); } diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index b6c1460313f86..732fdc7c9bc1f 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -42,41 +42,6 @@ FUNCTION_ANALYSIS("ssp-layout", SSPLayoutAnalysis()) FUNCTION_ANALYSIS("target-ir", TargetIRAnalysis(std::move(TM.getTargetIRAnalysis()))) #undef FUNCTION_ANALYSIS -#ifndef FUNCTION_PASS -#define FUNCTION_PASS(NAME, CREATE_PASS) -#endif -FUNCTION_PASS("callbr-prepare", CallBrPreparePass()) -FUNCTION_PASS("cfguard", CFGuardPass()) -FUNCTION_PASS("codegenprepare", CodeGenPreparePass(TM)) -FUNCTION_PASS("consthoist", ConstantHoistingPass()) -FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM)) -FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass(false)) -FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM)) -FUNCTION_PASS("expand-fp", ExpandFpPass(TM)) -FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM)) -FUNCTION_PASS("expand-reductions", ExpandReductionsPass()) -FUNCTION_PASS("gc-lowering", GCLoweringPass()) -FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(TM)) -FUNCTION_PASS("interleaved-access", InterleavedAccessPass(TM)) -FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass(TM)) -FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) -FUNCTION_PASS("lower-invoke", LowerInvokePass()) -FUNCTION_PASS("mergeicmps", MergeICmpsPass()) -FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass()) -FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass(true)) -FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib()) -FUNCTION_PASS("safe-stack", SafeStackPass(TM)) -FUNCTION_PASS("scalarize-masked-mem-intrin", ScalarizeMaskedMemIntrinPass()) -FUNCTION_PASS("select-optimize", SelectOptimizePass(TM)) -FUNCTION_PASS("sjlj-eh-prepare", SjLjEHPreparePass(TM)) -FUNCTION_PASS("stack-protector", StackProtectorPass(TM)) -FUNCTION_PASS("tlshoist", TLSVariableHoistPass()) -FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) -FUNCTION_PASS("verify", VerifierPass()) -FUNCTION_PASS("wasm-eh-prepare", WasmEHPreparePass()) -FUNCTION_PASS("win-eh-prepare", WinEHPreparePass()) -#undef FUNCTION_PASS - #ifndef LOOP_PASS #define LOOP_PASS(NAME, CREATE_PASS) #endif @@ -187,6 +152,7 @@ MACHINE_FUNCTION_PASS("print", MachineUniformityPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", SlotIndexesPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", VirtRegMapPrinterPass(errs())) +MACHINE_FUNCTION_PASS("process-imp-defs", ProcessImplicitDefsPass()) MACHINE_FUNCTION_PASS("prolog-epilog", 
PrologEpilogInserterPass()) MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass()) MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass()) @@ -318,7 +284,6 @@ DUMMY_MACHINE_FUNCTION_PASS("static-data-splitter", StaticDataSplitter) DUMMY_MACHINE_FUNCTION_PASS("machine-function-splitter", MachineFunctionSplitterPass) DUMMY_MACHINE_FUNCTION_PASS("machineinstr-printer", MachineFunctionPrinterPass) DUMMY_MACHINE_FUNCTION_PASS("mirfs-discriminators", MIRAddFSDiscriminatorsPass) -DUMMY_MACHINE_FUNCTION_PASS("processimpdefs", ProcessImplicitDefsPass) DUMMY_MACHINE_FUNCTION_PASS("prologepilog-code", PrologEpilogCodeInserterPass) DUMMY_MACHINE_FUNCTION_PASS("ra-basic", RABasicPass) DUMMY_MACHINE_FUNCTION_PASS("ra-pbqp", RAPBQPPass) diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h b/llvm/include/llvm/TableGen/StringToOffsetTable.h index 1550515ce3d7b..154ded8e94d7f 100644 --- a/llvm/include/llvm/TableGen/StringToOffsetTable.h +++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h @@ -23,10 +23,15 @@ namespace llvm { class StringToOffsetTable { StringMap StringOffset; std::string AggregateString; + + /// If this is to be a static class member, the prefix to use (i.e. class name + /// plus ::) + const StringRef ClassPrefix; const bool AppendZero; public: - StringToOffsetTable(bool AppendZero = true) : AppendZero(AppendZero) { + StringToOffsetTable(bool AppendZero = true, StringRef ClassPrefix = "") + : ClassPrefix(ClassPrefix), AppendZero(AppendZero) { // Ensure we always put the empty string at offset zero. That lets empty // initialization also be zero initialization for offsets into the table. GetOrAddStringOffset(""); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 66051d756c808..fc81ab76dc72d 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1132,14 +1132,14 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg, def udiv_by_const : GICombineRule< (defs root:$root), (match (G_UDIV $dst, $x, $y):$root, - [{ return Helper.matchUDivorURemByConst(*${root}); }]), - (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>; + [{ return Helper.matchUDivOrURemByConst(*${root}); }]), + (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>; def sdiv_by_const : GICombineRule< (defs root:$root), (match (G_SDIV $dst, $x, $y):$root, - [{ return Helper.matchSDivByConst(*${root}); }]), - (apply [{ Helper.applySDivByConst(*${root}); }])>; + [{ return Helper.matchSDivOrSRemByConst(*${root}); }]), + (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>; def sdiv_by_pow2 : GICombineRule< (defs root:$root), @@ -1159,10 +1159,16 @@ def intdiv_combines : GICombineGroup<[udiv_by_pow2, sdiv_by_pow2, def urem_by_const : GICombineRule< (defs root:$root), (match (G_UREM $dst, $x, $y):$root, - [{ return Helper.matchUDivorURemByConst(*${root}); }]), - (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>; + [{ return Helper.matchUDivOrURemByConst(*${root}); }]), + (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>; -def intrem_combines : GICombineGroup<[urem_by_const]>; +def srem_by_const : GICombineRule< + (defs root:$root), + (match (G_SREM $dst, $x, $y):$root, + [{ return Helper.matchSDivOrSRemByConst(*${root}); }]), + (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>; + +def intrem_combines : GICombineGroup<[urem_by_const, srem_by_const]>; def reassoc_ptradd : GICombineRule< (defs root:$root, 
build_fn_matchinfo:$matchinfo), diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h index 27a688bc12abf..397239b1685fb 100644 --- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h +++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h @@ -93,6 +93,9 @@ class LLVM_ABI TargetLoweringObjectFile : public MCObjectFileInfo { /// Emit Call Graph Profile metadata. void emitCGProfileMetadata(MCStreamer &Streamer, Module &M) const; + /// Emit pseudo_probe_desc metadata. + void emitPseudoProbeDescMetadata(MCStreamer &Streamer, Module &M) const; + /// Process linker options metadata and emit platform-specific bits. virtual void emitLinkerDirectives(MCStreamer &Streamer, Module &M) const {} diff --git a/llvm/include/llvm/TargetParser/Host.h b/llvm/include/llvm/TargetParser/Host.h index be3d41e022ad9..40a9b6cc13902 100644 --- a/llvm/include/llvm/TargetParser/Host.h +++ b/llvm/include/llvm/TargetParser/Host.h @@ -53,7 +53,7 @@ LLVM_ABI StringRef getHostCPUName(); /// which features may appear in this map, except that they are all valid LLVM /// feature names. The map can be empty, for example if feature detection /// fails. -LLVM_ABI const StringMap getHostCPUFeatures(); +LLVM_ABI StringMap getHostCPUFeatures(); /// This is a function compatible with cl::AddExtraVersionPrinter, which adds /// info about the current target triple and detected CPU. diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 657f4230379e8..670a6321fdc02 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -49,62 +49,64 @@ class Triple { enum ArchType { UnknownArch, - arm, // ARM (little endian): arm, armv.*, xscale - armeb, // ARM (big endian): armeb - aarch64, // AArch64 (little endian): aarch64 - aarch64_be, // AArch64 (big endian): aarch64_be - aarch64_32, // AArch64 (little endian) ILP32: aarch64_32 - arc, // ARC: Synopsys ARC - avr, // AVR: Atmel AVR microcontroller - bpfel, // eBPF or extended BPF or 64-bit BPF (little endian) - bpfeb, // eBPF or extended BPF or 64-bit BPF (big endian) - csky, // CSKY: csky - dxil, // DXIL 32-bit DirectX bytecode - hexagon, // Hexagon: hexagon - loongarch32, // LoongArch (32-bit): loongarch32 - loongarch64, // LoongArch (64-bit): loongarch64 - m68k, // M68k: Motorola 680x0 family - mips, // MIPS: mips, mipsallegrex, mipsr6 - mipsel, // MIPSEL: mipsel, mipsallegrexe, mipsr6el - mips64, // MIPS64: mips64, mips64r6, mipsn32, mipsn32r6 - mips64el, // MIPS64EL: mips64el, mips64r6el, mipsn32el, mipsn32r6el - msp430, // MSP430: msp430 - ppc, // PPC: powerpc - ppcle, // PPCLE: powerpc (little endian) - ppc64, // PPC64: powerpc64, ppu - ppc64le, // PPC64LE: powerpc64le - r600, // R600: AMD GPUs HD2XXX - HD6XXX - amdgcn, // AMDGCN: AMD GCN GPUs - riscv32, // RISC-V (32-bit): riscv32 - riscv64, // RISC-V (64-bit): riscv64 - sparc, // Sparc: sparc - sparcv9, // Sparcv9: Sparcv9 - sparcel, // Sparc: (endianness = little). 
NB: 'Sparcle' is a CPU variant - systemz, // SystemZ: s390x - tce, // TCE (http://tce.cs.tut.fi/): tce - tcele, // TCE little endian (http://tce.cs.tut.fi/): tcele - thumb, // Thumb (little endian): thumb, thumbv.* - thumbeb, // Thumb (big endian): thumbeb - x86, // X86: i[3-9]86 - x86_64, // X86-64: amd64, x86_64 - xcore, // XCore: xcore - xtensa, // Tensilica: Xtensa - nvptx, // NVPTX: 32-bit - nvptx64, // NVPTX: 64-bit - amdil, // AMDIL - amdil64, // AMDIL with 64-bit pointers - hsail, // AMD HSAIL - hsail64, // AMD HSAIL with 64-bit pointers - spir, // SPIR: standard portable IR for OpenCL 32-bit version - spir64, // SPIR: standard portable IR for OpenCL 64-bit version - spirv, // SPIR-V with logical memory layout. - spirv32, // SPIR-V with 32-bit pointers - spirv64, // SPIR-V with 64-bit pointers - kalimba, // Kalimba: generic kalimba - shave, // SHAVE: Movidius vector VLIW processors - lanai, // Lanai: Lanai 32-bit - wasm32, // WebAssembly with 32-bit pointers - wasm64, // WebAssembly with 64-bit pointers + arm, // ARM (little endian): arm, armv.*, xscale + armeb, // ARM (big endian): armeb + aarch64, // AArch64 (little endian): aarch64 + aarch64_be, // AArch64 (big endian): aarch64_be + aarch64_32, // AArch64 (little endian) ILP32: aarch64_32 + arc, // ARC: Synopsys ARC + avr, // AVR: Atmel AVR microcontroller + bpfel, // eBPF or extended BPF or 64-bit BPF (little endian) + bpfeb, // eBPF or extended BPF or 64-bit BPF (big endian) + csky, // CSKY: csky + dxil, // DXIL 32-bit DirectX bytecode + hexagon, // Hexagon: hexagon + loongarch32, // LoongArch (32-bit): loongarch32 + loongarch64, // LoongArch (64-bit): loongarch64 + m68k, // M68k: Motorola 680x0 family + mips, // MIPS: mips, mipsallegrex, mipsr6 + mipsel, // MIPSEL: mipsel, mipsallegrexe, mipsr6el + mips64, // MIPS64: mips64, mips64r6, mipsn32, mipsn32r6 + mips64el, // MIPS64EL: mips64el, mips64r6el, mipsn32el, mipsn32r6el + msp430, // MSP430: msp430 + ppc, // PPC: powerpc + ppcle, // PPCLE: powerpc (little endian) + ppc64, // PPC64: powerpc64, ppu + ppc64le, // PPC64LE: powerpc64le + r600, // R600: AMD GPUs HD2XXX - HD6XXX + amdgcn, // AMDGCN: AMD GCN GPUs + riscv32, // RISC-V (32-bit, little endian): riscv32 + riscv64, // RISC-V (64-bit, little endian): riscv64 + riscv32be, // RISC-V (32-bit, big endian): riscv32be + riscv64be, // RISC-V (64-bit, big endian): riscv64be + sparc, // Sparc: sparc + sparcv9, // Sparcv9: Sparcv9 + sparcel, // Sparc: (endianness = little). NB: 'Sparcle' is a CPU variant + systemz, // SystemZ: s390x + tce, // TCE (http://tce.cs.tut.fi/): tce + tcele, // TCE little endian (http://tce.cs.tut.fi/): tcele + thumb, // Thumb (little endian): thumb, thumbv.* + thumbeb, // Thumb (big endian): thumbeb + x86, // X86: i[3-9]86 + x86_64, // X86-64: amd64, x86_64 + xcore, // XCore: xcore + xtensa, // Tensilica: Xtensa + nvptx, // NVPTX: 32-bit + nvptx64, // NVPTX: 64-bit + amdil, // AMDIL + amdil64, // AMDIL with 64-bit pointers + hsail, // AMD HSAIL + hsail64, // AMD HSAIL with 64-bit pointers + spir, // SPIR: standard portable IR for OpenCL 32-bit version + spir64, // SPIR: standard portable IR for OpenCL 64-bit version + spirv, // SPIR-V with logical memory layout. 
+ spirv32, // SPIR-V with 32-bit pointers + spirv64, // SPIR-V with 64-bit pointers + kalimba, // Kalimba: generic kalimba + shave, // SHAVE: Movidius vector VLIW processors + lanai, // Lanai: Lanai 32-bit + wasm32, // WebAssembly with 32-bit pointers + wasm64, // WebAssembly with 64-bit pointers renderscript32, // 32-bit RenderScript renderscript64, // 64-bit RenderScript ve, // NEC SX-Aurora Vector Engine @@ -220,7 +222,6 @@ class Triple { ZOS, Haiku, RTEMS, - NaCl, // Native Client AIX, CUDA, // NVIDIA CUDA NVCL, // NVIDIA OpenCL @@ -719,11 +720,6 @@ class Triple { isWindowsItaniumEnvironment(); } - /// Tests whether the OS is NaCl (Native Client) - bool isOSNaCl() const { - return getOS() == Triple::NaCl; - } - /// Tests whether the OS is Linux. bool isOSLinux() const { return getOS() == Triple::Linux; @@ -1070,10 +1066,14 @@ class Triple { } /// Tests whether the target is 32-bit RISC-V. - bool isRISCV32() const { return getArch() == Triple::riscv32; } + bool isRISCV32() const { + return getArch() == Triple::riscv32 || getArch() == Triple::riscv32be; + } /// Tests whether the target is 64-bit RISC-V. - bool isRISCV64() const { return getArch() == Triple::riscv64; } + bool isRISCV64() const { + return getArch() == Triple::riscv64 || getArch() == Triple::riscv64be; + } /// Tests whether the target is RISC-V (32- and 64-bit). bool isRISCV() const { return isRISCV32() || isRISCV64(); } diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index df146458b4e6f..bb79d2568fca0 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -36,7 +36,6 @@ class BasicBlock; class BranchInst; class CallBase; class CallInst; -class DbgVariableIntrinsic; class DIBuilder; class DomTreeUpdater; class Function; @@ -275,36 +274,23 @@ LLVM_ABI CallInst *changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr); LLVM_ABI void InsertDebugValueAtStoreLoc(DbgVariableRecord *DVR, StoreInst *SI, DIBuilder &Builder); -/// Creates and inserts an llvm.dbg.value intrinsic before a store -/// that has an associated llvm.dbg.value intrinsic. -LLVM_ABI void InsertDebugValueAtStoreLoc(DbgVariableIntrinsic *DII, - StoreInst *SI, DIBuilder &Builder); - -/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value -/// that has an associated llvm.dbg.declare intrinsic. -LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - StoreInst *SI, - DIBuilder &Builder); +/// Inserts a dbg.value record before a store to an alloca'd value +/// that has an associated dbg.declare record. LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, StoreInst *SI, DIBuilder &Builder); -/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value -/// that has an associated llvm.dbg.declare intrinsic. -LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - LoadInst *LI, DIBuilder &Builder); +/// Inserts a dbg.value record before a load of an alloca'd value +/// that has an associated dbg.declare record. LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI, DIBuilder &Builder); -/// Inserts a llvm.dbg.value intrinsic after a phi that has an associated -/// llvm.dbg.declare intrinsic. -LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - PHINode *LI, DIBuilder &Builder); +/// Inserts a dbg.value record after a phi that has an associated +/// llvm.dbg.declare record. 
LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, PHINode *LI, DIBuilder &Builder); -/// Lowers llvm.dbg.declare intrinsics into appropriate set of -/// llvm.dbg.value intrinsics. +/// Lowers dbg.declare records into appropriate set of dbg.value records. LLVM_ABI bool LowerDbgDeclare(Function &F); /// Propagate dbg.value intrinsics through the newly inserted PHIs. @@ -312,7 +298,7 @@ LLVM_ABI void insertDebugValuesForPHIs(BasicBlock *BB, SmallVectorImpl &InsertedPHIs); -/// Replaces llvm.dbg.declare instruction when the address it +/// Replaces dbg.declare record when the address it /// describes is replaced with a new value. If Deref is true, an /// additional DW_OP_deref is prepended to the expression. If Offset /// is non-zero, a constant displacement is added to the expression @@ -321,10 +307,10 @@ LLVM_ABI bool replaceDbgDeclare(Value *Address, Value *NewAddress, DIBuilder &Builder, uint8_t DIExprFlags, int Offset); -/// Replaces multiple llvm.dbg.value instructions when the alloca it describes +/// Replaces multiple dbg.value records when the alloca it describes /// is replaced with a new value. If Offset is non-zero, a constant displacement /// is added to the expression (after the mandatory Deref). Offset can be -/// negative. New llvm.dbg.value instructions are inserted at the locations of +/// negative. New dbg.value records are inserted at the locations of /// the instructions they replace. LLVM_ABI void replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, int Offset = 0); diff --git a/llvm/include/llvm/Transforms/Utils/LockstepReverseIterator.h b/llvm/include/llvm/Transforms/Utils/LockstepReverseIterator.h index 1b6309c7fb1a4..5b92b33a10ea0 100644 --- a/llvm/include/llvm/Transforms/Utils/LockstepReverseIterator.h +++ b/llvm/include/llvm/Transforms/Utils/LockstepReverseIterator.h @@ -61,7 +61,7 @@ class LockstepReverseIterator } Insts.clear(); for (BasicBlock *BB : Blocks) { - Instruction *Prev = BB->getTerminator()->getPrevNonDebugInstruction(); + Instruction *Prev = BB->getTerminator()->getPrevNode(); if (!Prev) { // Block wasn't big enough - only contained a terminator. if constexpr (EarlyFailure) { @@ -108,7 +108,7 @@ class LockstepReverseIterator return *this; SmallVector NewInsts; for (Instruction *Inst : Insts) { - Instruction *Prev = Inst->getPrevNonDebugInstruction(); + Instruction *Prev = Inst->getPrevNode(); if (!Prev) { if constexpr (!EarlyFailure) { this->ActiveBlocks.remove(Inst->getParent()); @@ -133,7 +133,7 @@ class LockstepReverseIterator return *this; SmallVector NewInsts; for (Instruction *Inst : Insts) { - Instruction *Next = Inst->getNextNonDebugInstruction(); + Instruction *Next = Inst->getNextNode(); // Already at end of block. if (!Next) { Fail = true; diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h index 8b7daf616b110..f288bdfb84f49 100644 --- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h +++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h @@ -23,7 +23,6 @@ namespace llvm { class DominatorTree; -class DbgVariableIntrinsic; class IntrinsicInst; class PostDominatorTree; class AllocaInst; @@ -53,8 +52,6 @@ struct AllocaInfo { AllocaInst *AI; SmallVector LifetimeStart; SmallVector LifetimeEnd; - SmallVector DbgVariableIntrinsics; - // Non-intrinsic records of variable locations. 
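+  // Debug records describing this alloca's variable locations.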
SmallVector DbgVariableRecords; }; diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index a101151eed7cc..39fef921a9590 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -530,6 +530,7 @@ class SCEVExpander : public SCEVVisitor { bool isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L); + Value *tryToReuseLCSSAPhi(const SCEVAddRecExpr *S); Value *expandAddRecExprLiterally(const SCEVAddRecExpr *); PHINode *getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, const Loop *L, Type *&TruncTy, diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp index d7695e5cfc0d3..dd6ee32b80de8 100644 --- a/llvm/lib/Analysis/CallGraph.cpp +++ b/llvm/lib/Analysis/CallGraph.cpp @@ -13,7 +13,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/IR/AbstractCallSite.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 428342f51ad2e..dd9a44b9aecac 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3670,14 +3670,12 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, const SCEV *SrcEv = SE->getMinusSCEV(SrcSCEV, SrcBase); const SCEV *DstEv = SE->getMinusSCEV(DstSCEV, DstBase); - if (Src != Dst) { - // Check that memory access offsets are multiples of element sizes. - if (!SE->isKnownMultipleOf(SrcEv, EltSize, Assume) || - !SE->isKnownMultipleOf(DstEv, EltSize, Assume)) { - LLVM_DEBUG(dbgs() << "can't analyze SCEV with different offsets\n"); - return std::make_unique(Src, Dst, - SCEVUnionPredicate(Assume, *SE)); - } + // Check that memory access offsets are multiples of element sizes. + if (!SE->isKnownMultipleOf(SrcEv, EltSize, Assume) || + !SE->isKnownMultipleOf(DstEv, EltSize, Assume)) { + LLVM_DEBUG(dbgs() << "can't analyze SCEV with different offsets\n"); + return std::make_unique(Src, Dst, + SCEVUnionPredicate(Assume, *SE)); } if (!Assume.empty()) { diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp index 2cc3ad5f18482..92c9e37dbb484 100644 --- a/llvm/lib/Analysis/HashRecognize.cpp +++ b/llvm/lib/Analysis/HashRecognize.cpp @@ -102,8 +102,8 @@ class ValueEvolution { public: // ValueEvolution is meant to be constructed with the TripCount of the loop, - // and whether the polynomial algorithm is big-endian, for the significant-bit - // check. + // and a boolean indicating whether the polynomial algorithm is big-endian + // (for the significant-bit check). ValueEvolution(unsigned TripCount, bool ByteOrderSwapped); // Given a list of PHI nodes along with their incoming value from within the @@ -115,6 +115,10 @@ class ValueEvolution { // precise error message. StringRef getError() const { return ErrStr; } + // A set of Instructions visited by ValueEvolution. The only unvisited + // instructions will be ones not on the use-def chain of the PHIs' evolutions. + SmallPtrSet Visited; + // The computed KnownBits for each PHI node, which is populated after // computeEvolutions is called. 
KnownPhiMap KnownPhis; @@ -177,6 +181,9 @@ KnownBits ValueEvolution::computeBinOp(const BinaryOperator *I) { KnownBits ValueEvolution::computeInstr(const Instruction *I) { unsigned BitWidth = I->getType()->getScalarSizeInBits(); + // computeInstr is the only entry point that needs to update the Visited set. + Visited.insert(I); + // We look up in the map that contains the KnownBits of the PHI from the // previous iteration. if (const PHINode *P = dyn_cast<PHINode>(I)) @@ -185,9 +192,12 @@ KnownBits ValueEvolution::computeInstr(const Instruction *I) { // Compute the KnownBits for a Select(Cmp()), forcing it to take the branch // that is predicated on the (least|most)-significant-bit check. CmpPredicate Pred; - Value *L, *R, *TV, *FV; - if (match(I, m_Select(m_ICmp(Pred, m_Value(L), m_Value(R)), m_Value(TV), - m_Value(FV)))) { + Value *L, *R; + Instruction *TV, *FV; + if (match(I, m_Select(m_ICmp(Pred, m_Value(L), m_Value(R)), m_Instruction(TV), + m_Instruction(FV)))) { + Visited.insert(cast<Instruction>(I->getOperand(0))); + // We need to check LCR against [0, 2) in the little-endian case, because // the RCR check is insufficient: it is simply [0, 1). if (!ByteOrderSwapped) { @@ -209,10 +219,17 @@ KnownBits ValueEvolution::computeInstr(const Instruction *I) { ConstantRange CheckRCR(APInt::getZero(ICmpBW), ByteOrderSwapped ? APInt::getSignedMinValue(ICmpBW) : APInt(ICmpBW, 1)); - if (AllowedR == CheckRCR) + + // We only compute KnownBits of either TV or FV, as the other value would + // just be a bit-shift as checked by isBigEndianBitShift. + if (AllowedR == CheckRCR) { + Visited.insert(FV); return compute(TV); - if (AllowedR.inverse() == CheckRCR) + } + if (AllowedR.inverse() == CheckRCR) { + Visited.insert(TV); return compute(FV); + } ErrStr = "Bad RHS of significant-bit-check"; return {BitWidth}; @@ -634,6 +651,17 @@ HashRecognize::recognizeCRC() const { return VE.getError(); KnownBits ResultBits = VE.KnownPhis.at(ConditionalRecurrence.Phi); + // Exactly four instructions may remain unvisited, all part of the induction + // variable: the IndVar PHI, the latch branch, the latch compare, and the + // IndVar increment. Any other stray unvisited instruction would complicate + // this optimization, which replaces the whole loop with a table lookup.
+ std::initializer_list AugmentVisited = { + IndVar, Latch->getTerminator(), L.getLatchCmpInst(), + cast(IndVar->getIncomingValueForBlock(Latch))}; + VE.Visited.insert_range(AugmentVisited); + if (std::distance(Latch->begin(), Latch->end()) != VE.Visited.size()) + return "Found stray unvisited instructions"; + unsigned N = std::min(TC, ResultBits.getBitWidth()); auto IsZero = [](const KnownBits &K) { return K.isZero(); }; if (!checkExtractBits(ResultBits, N, IsZero, *ByteOrderSwapped)) diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 147982fe7511f..95f30fd3f4275 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab) : Vocab(std::move(Vocab)), Valid(true) {} bool Vocabulary::isValid() const { - return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid; + return Vocab.size() == Vocabulary::expectedSize() && Valid; } size_t Vocabulary::size() const { @@ -243,6 +243,17 @@ const ir2vec::Embedding &Vocabulary::operator[](const Value *Arg) const { return Vocab[MaxOpcodes + MaxTypeIDs + static_cast(ArgKind)]; } +StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) { + assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); +#define HANDLE_INST(NUM, OPCODE, CLASS) \ + if (Opcode == NUM) { \ + return #OPCODE; \ + } +#include "llvm/IR/Instruction.def" +#undef HANDLE_INST + return "UnknownOpcode"; +} + StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) { switch (TypeID) { case Type::VoidTyID: @@ -279,6 +290,7 @@ StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) { case Type::TargetExtTyID: return "UnknownTy"; } + return "UnknownTy"; } StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) { @@ -292,10 +304,11 @@ Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) { float DummyVal = 0.1f; // Create a dummy vocabulary with entries for all opcodes, types, and // operand - for (unsigned _ : seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxTypeIDs + - Vocabulary::MaxOperandKinds)) { + for ([[maybe_unused]] unsigned _ : + seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxTypeIDs + + Vocabulary::MaxOperandKinds)) { DummyVocab.push_back(Embedding(Dim, DummyVal)); - DummyVal += 0.1; + DummyVal += 0.1f; } return DummyVocab; } @@ -311,18 +324,28 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { return OperandKind::VariableID; } +unsigned Vocabulary::getNumericID(unsigned Opcode) { + assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); + return Opcode - 1; // Convert to zero-based index +} + +unsigned Vocabulary::getNumericID(Type::TypeID TypeID) { + assert(static_cast(TypeID) < MaxTypeIDs && "Invalid type ID"); + return MaxOpcodes + static_cast(TypeID); +} + +unsigned Vocabulary::getNumericID(const Value *Op) { + unsigned Index = static_cast(getOperandKind(Op)); + assert(Index < MaxOperandKinds && "Invalid OperandKind"); + return MaxOpcodes + MaxTypeIDs + Index; +} + StringRef Vocabulary::getStringKey(unsigned Pos) { - assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds && + assert(Pos < Vocabulary::expectedSize() && "Position out of bounds in vocabulary"); // Opcode - if (Pos < MaxOpcodes) { -#define HANDLE_INST(NUM, OPCODE, CLASS) \ - if (Pos == NUM - 1) { \ - return #OPCODE; \ - } -#include "llvm/IR/Instruction.def" -#undef HANDLE_INST - } + if (Pos < MaxOpcodes) + return getVocabKeyForOpcode(Pos + 1); // Type if (Pos < MaxOpcodes + MaxTypeIDs) return 
getVocabKeyForTypeID(static_cast(Pos - MaxOpcodes)); @@ -430,21 +453,18 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { // Handle Opcodes std::vector NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes, Embedding(Dim, 0)); -#define HANDLE_INST(NUM, OPCODE, CLASS) \ - { \ - auto It = OpcVocab.find(#OPCODE); \ - if (It != OpcVocab.end()) \ - NumericOpcodeEmbeddings[NUM - 1] = It->second; \ - else \ - handleMissingEntity(#OPCODE); \ + for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) { + StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1); + auto It = OpcVocab.find(VocabKey.str()); + if (It != OpcVocab.end()) + NumericOpcodeEmbeddings[Opcode] = It->second; + else + handleMissingEntity(VocabKey.str()); } -#include "llvm/IR/Instruction.def" -#undef HANDLE_INST Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(), NumericOpcodeEmbeddings.end()); - // Handle Types using direct iteration through TypeID enum - // We iterate through all possible TypeID values and map them to embeddings + // Handle Types std::vector NumericTypeEmbeddings(Vocabulary::MaxTypeIDs, Embedding(Dim, 0)); for (unsigned TypeID : seq(0u, Vocabulary::MaxTypeIDs)) { @@ -498,7 +518,8 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) { // Otherwise, try to read from the vocabulary file. if (VocabFile.empty()) { // FIXME: Use default vocabulary - Ctx->emitError("IR2Vec vocabulary file path not specified"); + Ctx->emitError("IR2Vec vocabulary file path not specified; You may need to " + "set it using --ir2vec-vocab-path"); return Vocabulary(); // Return invalid result } if (auto Err = readVocabulary()) { diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index f8f741575f87a..f3a32d3055edb 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2085,6 +2085,12 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( if (!isa(Dist)) FoundNonConstantDistanceDependence |= StrideAPtrInt == StrideBPtrInt; + // If distance is a SCEVCouldNotCompute, return Unknown immediately. 
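+  // A similar check previously lived in isDependent and is removed in the
+  // hunk below, so a DepDistanceStrideAndSizeInfo is never built from an
+  // uncomputable distance.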
+ if (isa(Dist)) { + LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n"); + return Dependence::Unknown; + } + return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite); } @@ -2122,13 +2128,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, std::get(Res); bool HasSameSize = TypeByteSize > 0; - if (isa(Dist)) { - if (CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; - LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n"); - return Dependence::Unknown; - } - ScalarEvolution &SE = *PSE.getSE(); auto &DL = InnermostLoop->getHeader()->getDataLayout(); diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 901cfe03ecd33..518a634cdb363 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -37,7 +37,6 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/GenericLoopInfoImpl.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index d6f490cb69a52..3aa9909df8e55 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -353,7 +353,7 @@ static bool canSkipClobberingStore(const StoreInst *SI, if (BatchAA.alias(MemoryLocation::get(LI), MemLoc) != AliasResult::MustAlias) return false; unsigned NumVisitedInsts = 0; - for (const Instruction *I = LI; I != SI; I = I->getNextNonDebugInstruction()) + for (const Instruction *I = LI; I != SI; I = I->getNextNode()) if (++NumVisitedInsts > ScanLimit || isModSet(BatchAA.getModRefInfo(I, MemLoc))) return false; diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 973c9b1d30ca1..e475be243123a 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -1122,7 +1122,7 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_cabsf: case LibFunc_cabsl: { Type *RetTy = FTy.getReturnType(); - if (!RetTy->isFloatingPointTy()) + if (!RetTy->isFloatingPointTy() || NumParams == 0) return false; Type *ParamTy = FTy.getParamType(0); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 21f844c4d2f45..61a322be03da1 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7914,6 +7914,18 @@ bool llvm::intrinsicPropagatesPoison(Intrinsic::ID IID) { case Intrinsic::smul_fix_sat: case Intrinsic::pow: case Intrinsic::powi: + case Intrinsic::sin: + case Intrinsic::sinh: + case Intrinsic::cos: + case Intrinsic::cosh: + case Intrinsic::sincos: + case Intrinsic::sincospi: + case Intrinsic::tan: + case Intrinsic::tanh: + case Intrinsic::asin: + case Intrinsic::acos: + case Intrinsic::atan: + case Intrinsic::atan2: case Intrinsic::canonicalize: case Intrinsic::sqrt: return true; diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 7f0ed0b60a785..1b3da590cff7f 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -306,6 +306,15 @@ unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) { } } +VectorType *llvm::getDeinterleavedVectorType(IntrinsicInst *DI) { + [[maybe_unused]] unsigned Factor = + getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); + ArrayRef DISubtypes = DI->getType()->subtypes(); + 
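+  // The deinterleave intrinsic returns a literal struct of Factor vectors
+  // that all share one vector type, so the first subtype is representative.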
assert(Factor && Factor == DISubtypes.size() && + "unexpected deinterleave factor or result type"); + return cast(DISubtypes[0]); +} + /// Given a vector and an element number, see if the scalar value is /// already around as a register, for example if it were inserted then extracted /// from the vector. diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index e8d1aba63afb4..1ab521e383583 100644 --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -23,7 +23,6 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/Casting.h" #include "llvm/Support/LEB128.h" #include "llvm/Target/TargetLoweringObjectFile.h" diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index b9b394909bb4c..c3b4077b27dd8 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -110,7 +110,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializePostRAHazardRecognizerLegacyPass(Registry); initializePostRASchedulerLegacyPass(Registry); initializePreISelIntrinsicLoweringLegacyPassPass(Registry); - initializeProcessImplicitDefsPass(Registry); + initializeProcessImplicitDefsLegacyPass(Registry); initializeRABasicPass(Registry); initializeRAGreedyLegacyPass(Registry); initializeRegAllocFastPass(Registry); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9bbb89e37865d..d9d41f1d72e35 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -3015,7 +3015,7 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, // %phi = phi ptr [ %0, %bb0 ], [ %2, %entry ] if (PredBB && PredBB->getSingleSuccessor() == BB) CI = dyn_cast_or_null( - PredBB->getTerminator()->getPrevNonDebugInstruction(true)); + PredBB->getTerminator()->getPrevNode()); if (CI && CI->use_empty() && isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && @@ -3032,7 +3032,7 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, for (BasicBlock *Pred : predecessors(BB)) { if (!VisitedBBs.insert(Pred).second) continue; - if (Instruction *I = Pred->rbegin()->getPrevNonDebugInstruction(true)) { + if (Instruction *I = Pred->rbegin()->getPrevNode()) { CallInst *CI = dyn_cast(I); if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) && attributesPermitTailCall(F, CI, RetI, *TLI)) { @@ -7328,7 +7328,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT)) return false; - IRBuilder<> Builder(Load->getNextNonDebugInstruction()); + IRBuilder<> Builder(Load->getNextNode()); auto *NewAnd = cast( Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); // Mark this instruction as "inserted by CGP", so that other diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3922eba55e195..e8f513ad5a7a9 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5300,7 +5300,7 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI, return false; } -MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { +MachineInstr *CombinerHelper::buildUDivOrURemUsingMul(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); auto &UDivorRem = cast(MI); @@ -5468,7 
+5468,7 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { return ret; } -bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { +bool CombinerHelper::matchUDivOrURemByConst(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); Register Dst = MI.getOperand(0).getReg(); @@ -5517,13 +5517,14 @@ bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const { - auto *NewMI = buildUDivorURemUsingMul(MI); +void CombinerHelper::applyUDivOrURemByConst(MachineInstr &MI) const { + auto *NewMI = buildUDivOrURemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } -bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); +bool CombinerHelper::matchSDivOrSRemByConst(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM); Register Dst = MI.getOperand(0).getReg(); Register RHS = MI.getOperand(2).getReg(); LLT DstTy = MRI.getType(Dst); @@ -5543,7 +5544,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { return false; // If the sdiv has an 'exact' flag we can use a simpler lowering. - if (MI.getFlag(MachineInstr::MIFlag::IsExact)) { + if (Opcode == TargetOpcode::G_SDIV && + MI.getFlag(MachineInstr::MIFlag::IsExact)) { return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } @@ -5559,23 +5561,28 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) && !isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}})) return false; + if (Opcode == TargetOpcode::G_SREM && + !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}})) + return false; } return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applySDivByConst(MachineInstr &MI) const { - auto *NewMI = buildSDivUsingMul(MI); +void CombinerHelper::applySDivOrSRemByConst(MachineInstr &MI) const { + auto *NewMI = buildSDivOrSRemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } -MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); - auto &SDiv = cast(MI); - Register Dst = SDiv.getReg(0); - Register LHS = SDiv.getReg(1); - Register RHS = SDiv.getReg(2); +MachineInstr *CombinerHelper::buildSDivOrSRemUsingMul(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(MI.getOpcode() == TargetOpcode::G_SDIV || + Opcode == TargetOpcode::G_SREM); + auto &SDivorRem = cast(MI); + Register Dst = SDivorRem.getReg(0); + Register LHS = SDivorRem.getReg(1); + Register RHS = SDivorRem.getReg(2); LLT Ty = MRI.getType(Dst); LLT ScalarTy = Ty.getScalarType(); const unsigned EltBits = ScalarTy.getScalarSizeInBits(); @@ -5705,7 +5712,13 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const { auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1); auto T = MIB.buildLShr(Ty, Q, SignShift); T = MIB.buildAnd(Ty, T, ShiftMask); - return MIB.buildAdd(Ty, Q, T); + auto ret = MIB.buildAdd(Ty, Q, T); + + if (Opcode == TargetOpcode::G_SREM) { + auto Prod = MIB.buildMul(Ty, ret, RHS); + 
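+      // The remainder is recovered from the quotient already in `ret`:
+      // rem = LHS - (LHS / RHS) * RHS. E.g. 7 srem 3: quotient 2, Prod 6,
+      // and the G_SUB below yields 1.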
return MIB.buildSub(Ty, LHS, Prod); + } + return ret; } bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const { diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp index 92ecfadf97c99..73f11c1345daf 100644 --- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp +++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp @@ -95,6 +95,10 @@ bool isEligibleFunction(Function *F) { if (F->getCallingConv() == CallingConv::SwiftTail) return false; + // Unnamed functions are skipped for simplicity. + if (!F->hasName()) + return false; + // If function contains callsites with musttail, if we merge // it, the merged function will have the musttail callsite, but // the number of parameters can change, thus the parameter count diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 7259834975cf4..d2b2edf2ebc80 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -367,34 +367,23 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( bool BinOpShuffleChanged = replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); + Value *Mask = nullptr; if (auto *VPLoad = dyn_cast(Load)) { - Value *LaneMask = - getMask(VPLoad->getMaskParam(), Factor, cast(VecTy)); - if (!LaneMask) + Mask = getMask(VPLoad->getMaskParam(), Factor, cast(VecTy)); + if (!Mask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n"); - - // Sometimes the number of Shuffles might be less than Factor, we have to - // fill the gaps with null. Also, lowerInterleavedVPLoad - // expects them to be sorted. - SmallVector ShuffleValues(Factor, nullptr); - for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) - ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; - if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) - // If Extracts is not empty, tryReplaceExtracts made changes earlier. - return !Extracts.empty() || BinOpShuffleChanged; } else { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); - - // Try to create target specific intrinsics to replace the load and - // shuffles. - if (!TLI->lowerInterleavedLoad(cast(Load), Shuffles, Indices, - Factor)) - // If Extracts is not empty, tryReplaceExtracts made changes earlier. - return !Extracts.empty() || BinOpShuffleChanged; } + // Try to create target specific intrinsics to replace the load and + // shuffles. + if (!TLI->lowerInterleavedLoad(cast(Load), Mask, Shuffles, + Indices, Factor)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; + DeadInsts.insert_range(Shuffles); DeadInsts.insert(Load); @@ -618,40 +607,18 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); assert(Factor && "unexpected deinterleave intrinsic"); - SmallVector DeinterleaveValues(Factor, nullptr); - Value *LastFactor = nullptr; - for (auto *User : DI->users()) { - auto *Extract = dyn_cast(User); - if (!Extract || Extract->getNumIndices() != 1) - return false; - unsigned Idx = Extract->getIndices()[0]; - if (DeinterleaveValues[Idx]) - return false; - DeinterleaveValues[Idx] = Extract; - LastFactor = Extract; - } - - if (!LastFactor) - return false; - + Value *Mask = nullptr; if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) return false; // Check mask operand. 
Handle both all-true/false and interleaved mask. Value *WideMask = VPLoad->getOperand(1); - Value *Mask = - getMask(WideMask, Factor, cast(LastFactor->getType())); + Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic " << *DI << " and factor = " << Factor << "\n"); - - // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special - // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) - return false; - } else { auto *LI = cast(LoadedVal); if (!LI->isSimple()) @@ -659,15 +626,13 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI << " and factor = " << Factor << "\n"); - - // Try and match this with target specific intrinsics. - if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) - return false; } - for (Value *V : DeinterleaveValues) - if (V) - DeadInsts.insert(cast(V)); + // Try and match this with target specific intrinsics. + if (!TLI->lowerDeinterleaveIntrinsicToLoad(cast(LoadedVal), Mask, + DI)) + return false; + DeadInsts.insert(DI); // We now have a target-specific load, so delete the old one. DeadInsts.insert(cast(LoadedVal)); @@ -686,23 +651,19 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( const unsigned Factor = getInterleaveIntrinsicFactor(II->getIntrinsicID()); assert(Factor && "unexpected interleave intrinsic"); + Value *Mask = nullptr; if (auto *VPStore = dyn_cast(StoredBy)) { if (VPStore->getIntrinsicID() != Intrinsic::vp_store) return false; Value *WideMask = VPStore->getOperand(2); - Value *Mask = getMask(WideMask, Factor, - cast(InterleaveValues[0]->getType())); + Mask = getMask(WideMask, Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic " << *II << " and factor = " << Factor << "\n"); - - // Since lowerInterleavedStore expects Shuffle and StoreInst, use special - // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues)) - return false; } else { auto *SI = cast(StoredBy); if (!SI->isSimple()) @@ -710,12 +671,13 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II << " and factor = " << Factor << "\n"); - - // Try and match this with target specific intrinsics. - if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) - return false; } + // Try and match this with target specific intrinsics. + if (!TLI->lowerInterleaveIntrinsicToStore(cast(StoredBy), Mask, + InterleaveValues)) + return false; + // We now have a target-specific store, so delete the old one. 
DeadInsts.insert(cast(StoredBy)); DeadInsts.insert(II); diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp index e7a4d6d61e211..116a919585d70 100644 --- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp @@ -45,3 +45,9 @@ MachineFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { return Result(std::move(MF)); } + +PreservedAnalyses FreeMachineFunctionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + FAM.clearAnalysis(F); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index a0e067a6323b7..9ec5151a039b7 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -2102,8 +2102,7 @@ class PostRAMachineSinkingLegacy : public MachineFunctionPass { } MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); + return MachineFunctionProperties().setNoVRegs(); } }; diff --git a/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/llvm/lib/CodeGen/ProcessImplicitDefs.cpp index 6c84cc2b64324..37a9e6203af7b 100644 --- a/llvm/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/llvm/lib/CodeGen/ProcessImplicitDefs.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/ProcessImplicitDefs.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -26,24 +27,15 @@ using namespace llvm; namespace { /// Process IMPLICIT_DEF instructions and make sure there is one implicit_def /// for each use. Add isUndef marker to implicit_def defs and their uses. 
-class ProcessImplicitDefs : public MachineFunctionPass { - const TargetInstrInfo *TII = nullptr; - const TargetRegisterInfo *TRI = nullptr; - MachineRegisterInfo *MRI = nullptr; - - SmallSetVector WorkList; - - void processImplicitDef(MachineInstr *MI); - bool canTurnIntoImplicitDef(MachineInstr *MI); - +class ProcessImplicitDefsLegacy : public MachineFunctionPass { public: static char ID; - ProcessImplicitDefs() : MachineFunctionPass(ID) { - initializeProcessImplicitDefsPass(*PassRegistry::getPassRegistry()); + ProcessImplicitDefsLegacy() : MachineFunctionPass(ID) { + initializeProcessImplicitDefsLegacyPass(*PassRegistry::getPassRegistry()); } - void getAnalysisUsage(AnalysisUsage &au) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnMachineFunction(MachineFunction &MF) override; @@ -51,15 +43,29 @@ class ProcessImplicitDefs : public MachineFunctionPass { return MachineFunctionProperties().setIsSSA(); } }; + +class ProcessImplicitDefs { + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; + + SmallSetVector WorkList; + + void processImplicitDef(MachineInstr *MI); + bool canTurnIntoImplicitDef(MachineInstr *MI); + +public: + bool run(MachineFunction &MF); +}; } // end anonymous namespace -char ProcessImplicitDefs::ID = 0; -char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID; +char ProcessImplicitDefsLegacy::ID = 0; +char &llvm::ProcessImplicitDefsID = ProcessImplicitDefsLegacy::ID; -INITIALIZE_PASS(ProcessImplicitDefs, DEBUG_TYPE, +INITIALIZE_PASS(ProcessImplicitDefsLegacy, DEBUG_TYPE, "Process Implicit Definitions", false, false) -void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { +void ProcessImplicitDefsLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); @@ -132,9 +138,24 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) { LLVM_DEBUG(dbgs() << "Keeping physreg: " << *MI); } +bool ProcessImplicitDefsLegacy::runOnMachineFunction(MachineFunction &MF) { + return ProcessImplicitDefs().run(MF); +} + +PreservedAnalyses +ProcessImplicitDefsPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (!ProcessImplicitDefs().run(MF)) + return PreservedAnalyses::all(); + + return getMachineFunctionPassPreservedAnalyses() + .preserveSet() + .preserve(); +} + /// processImplicitDefs - Process IMPLICIT_DEF instructions and turn them into /// operands. 
-bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) { +bool ProcessImplicitDefs::run(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n" << "********** Function: " << MF.getName() << '\n'); diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp index 30523611977f4..9b23a6aac629a 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index da229f86f24ce..996207034d076 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -475,8 +475,16 @@ void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, Instruction &RI, SplitBlockAndInsertIfThen(Cmp, &RI, /* Unreachable */ true, Weights, DTU); IRBuilder<> IRBFail(CheckTerm); // FIXME: respect -fsanitize-trap / -ftrap-function here? + const char *StackChkFailName = + TL.getLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL); + if (!StackChkFailName) { + F.getContext().emitError( + "no libcall available for stackprotector check fail"); + return; + } + FunctionCallee StackChkFail = - F.getParent()->getOrInsertFunction("__stack_chk_fail", IRB.getVoidTy()); + F.getParent()->getOrInsertFunction(StackChkFailName, IRB.getVoidTy()); IRBFail.CreateCall(StackChkFail, {}); } @@ -791,8 +799,16 @@ bool SafeStack::run() { IRB.SetCurrentDebugLocation( DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP)); if (SafeStackUsePointerAddress) { + const char *SafestackPointerAddressName = + TL.getLibcallName(RTLIB::SAFESTACK_POINTER_ADDRESS); + if (!SafestackPointerAddressName) { + F.getContext().emitError( + "no libcall available for safestack pointer address"); + return false; + } + FunctionCallee Fn = F.getParent()->getOrInsertFunction( - "__safestack_pointer_address", IRB.getPtrTy(0)); + SafestackPointerAddressName, IRB.getPtrTy(0)); UnsafeStackPtr = IRB.CreateCall(Fn); } else { UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 231184587d682..40464e91f9efc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -609,6 +609,8 @@ namespace { SDValue foldABSToABD(SDNode *N, const SDLoc &DL); SDValue foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode CC, const SDLoc &DL); + SDValue foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True, + SDValue False, ISD::CondCode CC, const SDLoc &DL); SDValue unfoldMaskedMerge(SDNode *N); SDValue unfoldExtremeBitClearingToShifts(SDNode *N); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, @@ -859,7 +861,7 @@ namespace { auto LK = TLI.getTypeConversion(*DAG.getContext(), VT); return (LK.first == TargetLoweringBase::TypeLegal || LK.first == TargetLoweringBase::TypePromoteInteger) && - TLI.isOperationLegal(ISD::UMIN, LK.second); + TLI.isOperationLegalOrCustom(ISD::UMIN, LK.second); } public: @@ -2606,9 +2608,7 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { return SDValue(); } - SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); - 
SelectOp->setFlags(BO->getFlags()); - return SelectOp; + return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF, BO->getFlags()); } static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, const SDLoc &DL, @@ -4095,6 +4095,26 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return N0; } + // (sub x, ([v]select (ult x, y), 0, y)) -> (umin x, (sub x, y)) + // (sub x, ([v]select (uge x, y), y, 0)) -> (umin x, (sub x, y)) + if (N1.hasOneUse() && hasUMin(VT)) { + SDValue Y; + if (sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETULT)), + m_Zero(), m_Deferred(Y))) || + sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETUGE)), + m_Deferred(Y), m_Zero())) || + sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETULT)), + m_Zero(), m_Deferred(Y))) || + sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETUGE)), + m_Deferred(Y), m_Zero()))) + return DAG.getNode(ISD::UMIN, DL, VT, N0, + DAG.getNode(ISD::SUB, DL, VT, N0, Y)); + } + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -4444,20 +4464,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { sd_match(N1, m_UMaxLike(m_Specific(A), m_Specific(B)))) return DAG.getNegative(DAG.getNode(ISD::ABDU, DL, VT, A, B), DL, VT); - // (sub x, (select (ult x, y), 0, y)) -> (umin x, (sub x, y)) - // (sub x, (select (uge x, y), y, 0)) -> (umin x, (sub x, y)) - if (hasUMin(VT)) { - SDValue Y; - if (sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETULT)), - m_Zero(), m_Deferred(Y)))) || - sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETUGE)), - m_Deferred(Y), m_Zero())))) - return DAG.getNode(ISD::UMIN, DL, VT, N0, - DAG.getNode(ISD::SUB, DL, VT, N0, Y)); - } - return SDValue(); } @@ -9149,7 +9155,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, if (Op.getOpcode() != ISD::LOAD && VectorIndex.has_value()) return std::nullopt; - unsigned BitWidth = Op.getValueSizeInBits(); + unsigned BitWidth = Op.getScalarValueSizeInBits(); if (BitWidth % 8 != 0) return std::nullopt; unsigned ByteWidth = BitWidth / 8; @@ -9248,7 +9254,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, if (!L->isSimple() || L->isIndexed()) return std::nullopt; - unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); + unsigned NarrowBitWidth = L->getMemoryVT().getScalarSizeInBits(); if (NarrowBitWidth % 8 != 0) return std::nullopt; uint64_t NarrowByteWidth = NarrowBitWidth / 8; @@ -12175,6 +12181,30 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, return SDValue(); } +// ([v]select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) +// ([v]select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) +SDValue DAGCombiner::foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True, + SDValue False, ISD::CondCode CC, + const SDLoc &DL) { + APInt C; + EVT VT = True.getValueType(); + if (sd_match(RHS, m_ConstInt(C)) && hasUMin(VT)) { + if (CC == ISD::SETUGT && LHS == False && + sd_match(True, m_Add(m_Specific(False), m_SpecificInt(~C)))) { + SDValue AddC = DAG.getConstant(~C, DL, VT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, False, AddC); + return DAG.getNode(ISD::UMIN, DL, VT, Add, False); + } + if (CC == ISD::SETULT && LHS == True && + sd_match(False, m_Add(m_Specific(True), m_SpecificInt(-C)))) { + SDValue AddC = DAG.getConstant(-C, DL, VT); + SDValue Add = DAG.getNode(ISD::ADD, DL, 
VT, True, AddC); + return DAG.getNode(ISD::UMIN, DL, VT, True, Add); + } + } + return SDValue(); +} + SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12191,11 +12221,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { return V; // select (not Cond), N1, N2 -> select Cond, N2, N1 - if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) { - SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1); - SelectOp->setFlags(Flags); - return SelectOp; - } + if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) + return DAG.getSelect(DL, VT, F, N2, N1, Flags); if (SDValue V = foldSelectOfConstants(N)) return V; @@ -12363,24 +12390,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // (select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) // (select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) - APInt C; - if (sd_match(Cond1, m_ConstInt(C)) && hasUMin(VT)) { - if (CC == ISD::SETUGT && Cond0 == N2 && - sd_match(N1, m_Add(m_Specific(N2), m_SpecificInt(~C)))) { - // The resulting code relies on an unsigned wrap in ADD. - // Recreating ADD to drop possible nuw/nsw flags. - SDValue AddC = DAG.getConstant(~C, DL, VT); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N2, AddC); - return DAG.getNode(ISD::UMIN, DL, VT, Add, N2); - } - if (CC == ISD::SETULT && Cond0 == N1 && - sd_match(N2, m_Add(m_Specific(N1), m_SpecificInt(-C)))) { - // Ditto. - SDValue AddC = DAG.getConstant(-C, DL, VT); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, AddC); - return DAG.getNode(ISD::UMIN, DL, VT, N1, Add); - } - } + if (SDValue UMin = foldSelectToUMin(Cond0, Cond1, N1, N2, CC, DL)) + return UMin; } if (!VT.isVector()) @@ -13417,6 +13428,11 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { } } } + + // (vselect (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) + // (vselect (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) + if (SDValue UMin = foldSelectToUMin(LHS, RHS, N1, N2, CC, DL)) + return UMin; } if (SimplifySelectOps(N, N1, N2)) @@ -13490,11 +13506,9 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { // Fold to a simpler select_cc if (SCC.getOpcode() == ISD::SETCC) { - SDValue SelectOp = - DAG.getNode(ISD::SELECT_CC, DL, N2.getValueType(), SCC.getOperand(0), - SCC.getOperand(1), N2, N3, SCC.getOperand(2)); - SelectOp->setFlags(SCC->getFlags()); - return SelectOp; + return DAG.getNode(ISD::SELECT_CC, DL, N2.getValueType(), + SCC.getOperand(0), SCC.getOperand(1), N2, N3, + SCC.getOperand(2), SCC->getFlags()); } } @@ -28194,14 +28208,16 @@ SDValue DAGCombiner::SimplifyVCastOp(SDNode *N, const SDLoc &DL) { TLI.preferScalarizeSplat(N)) { EVT SrcVT = N0.getValueType(); EVT SrcEltVT = SrcVT.getVectorElementType(); - SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL); - SDValue Elt = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcEltVT, Src0, IndexC); - SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, Elt, N->getFlags()); - if (VT.isScalableVector()) - return DAG.getSplatVector(VT, DL, ScalarBO); - SmallVector Ops(VT.getVectorNumElements(), ScalarBO); - return DAG.getBuildVector(VT, DL, Ops); + if (!LegalTypes || TLI.isTypeLegal(SrcEltVT)) { + SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL); + SDValue Elt = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcEltVT, Src0, IndexC); + SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, Elt, N->getFlags()); + if (VT.isScalableVector()) + return DAG.getSplatVector(VT, DL, ScalarBO); + SmallVector Ops(VT.getVectorNumElements(), ScalarBO); + return DAG.getBuildVector(VT, DL, Ops); + } } return 
SDValue(); @@ -28343,10 +28359,8 @@ SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SCC.getOperand(0), SCC.getOperand(1), SCC.getOperand(4), Flags); AddToWorklist(SETCC.getNode()); - SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, - SCC.getOperand(2), SCC.getOperand(3)); - SelectNode->setFlags(Flags); - return SelectNode; + return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, + SCC.getOperand(2), SCC.getOperand(3), Flags); } return SCC; @@ -28647,9 +28661,9 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) { SDValue N10 = N1.getOperand(0); SDValue N20 = N2.getOperand(0); SDValue NewSel = DAG.getSelect(DL, N10.getValueType(), N0, N10, N20); - SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, NewSel, N1.getOperand(1)); - NewBinOp->setFlags(N1->getFlags()); - NewBinOp->intersectFlagsWith(N2->getFlags()); + SDNodeFlags Flags = N1->getFlags() & N2->getFlags(); + SDValue NewBinOp = + DAG.getNode(BinOpc, DL, OpVTs, {NewSel, N1.getOperand(1)}, Flags); return SDValue(NewBinOp.getNode(), N1.getResNo()); } @@ -28661,10 +28675,9 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) { // Second op VT might be different (e.g. shift amount type) if (N11.getValueType() == N21.getValueType()) { SDValue NewSel = DAG.getSelect(DL, N11.getValueType(), N0, N11, N21); + SDNodeFlags Flags = N1->getFlags() & N2->getFlags(); SDValue NewBinOp = - DAG.getNode(BinOpc, DL, OpVTs, N1.getOperand(0), NewSel); - NewBinOp->setFlags(N1->getFlags()); - NewBinOp->intersectFlagsWith(N2->getFlags()); + DAG.getNode(BinOpc, DL, OpVTs, {N1.getOperand(0), NewSel}, Flags); return SDValue(NewBinOp.getNode(), N1.getResNo()); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 03d3e8eab35d0..85efb1bd8aed9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -784,7 +784,7 @@ MachineInstr * InstrEmitter::EmitDbgInstrRef(SDDbgValue *SD, VRBaseMapType &VRBaseMap) { MDNode *Var = SD->getVariable(); - const DIExpression *Expr = (DIExpression *)SD->getExpression(); + const DIExpression *Expr = SD->getExpression(); DebugLoc DL = SD->getDebugLoc(); const MCInstrDesc &RefII = TII->get(TargetOpcode::DBG_INSTR_REF); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 3656500ddf5ff..5453828177c72 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5569,6 +5569,12 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::BUILD_VECTOR: case ISD::BUILD_PAIR: case ISD::SPLAT_VECTOR: + case ISD::FABS: + return false; + + case ISD::ABS: + // ISD::ABS defines abs(INT_MIN) -> INT_MIN and never generates poison. + // Different to Intrinsic::abs. return false; case ISD::ADDC: @@ -6745,7 +6751,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: + break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } @@ -13867,6 +13875,8 @@ void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) { return; } + const SDNode *EntrySDN = getEntryNode().getNode(); + // We need to copy NodeExtraInfo to all _new_ nodes that are being introduced // through the replacement of From with To. 
Otherwise, replacements of a node // (From) with more complex nodes (To and its operands) may result in lost @@ -13898,9 +13908,14 @@ void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) { return true; if (!Visited.insert(N).second) return true; - if (getEntryNode().getNode() == N) + if (EntrySDN == N) return false; for (const SDValue &Op : N->op_values()) { + if (N == To && Op.getNode() == EntrySDN) { + // Special case: New node's operand is the entry node; just need to + // copy extra info to new node. + break; + } if (!Self(Self, Op.getNode())) return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ecd1ff87e7fbc..01e53123ea7e1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -842,6 +843,23 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, } } +static void failForInvalidBundles(const CallBase &I, StringRef Name, + ArrayRef AllowedBundles) { + if (I.hasOperandBundlesOtherThan(AllowedBundles)) { + ListSeparator LS; + std::string Error; + raw_string_ostream OS(Error); + for (unsigned i = 0, e = I.getNumOperandBundles(); i != e; ++i) { + OperandBundleUse U = I.getOperandBundleAt(i); + if (!is_contained(AllowedBundles, U.getTagID())) + OS << LS << U.getTagName(); + } + reportFatalUsageError( + Twine("cannot lower ", Name) + .concat(Twine(" with arbitrary operand bundles: ", Error))); + } +} + RegsForValue::RegsForValue(const SmallVector ®s, MVT regvt, EVT valuevt, std::optional CC) : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs), @@ -3351,13 +3369,12 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { // Deopt and ptrauth bundles are lowered in helper functions, and we don't // have to do anything here to lower funclet bundles. - if (I.hasOperandBundlesOtherThan( - {LLVMContext::OB_deopt, LLVMContext::OB_gc_transition, - LLVMContext::OB_gc_live, LLVMContext::OB_funclet, - LLVMContext::OB_cfguardtarget, LLVMContext::OB_ptrauth, - LLVMContext::OB_clang_arc_attachedcall})) - reportFatalUsageError( - "cannot lower invokes with arbitrary operand bundles!"); + failForInvalidBundles(I, "invokes", + {LLVMContext::OB_deopt, LLVMContext::OB_gc_transition, + LLVMContext::OB_gc_live, LLVMContext::OB_funclet, + LLVMContext::OB_cfguardtarget, LLVMContext::OB_ptrauth, + LLVMContext::OB_clang_arc_attachedcall, + LLVMContext::OB_kcfi}); const Value *Callee(I.getCalledOperand()); const Function *Fn = dyn_cast(Callee); @@ -3457,10 +3474,8 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) { // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't // have to do anything here to lower funclet bundles. 
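The new failForInvalidBundles helper above replaces several copies of a generic "arbitrary operand bundles!" diagnostic with one that names the offending bundle tags. The message assembly leans on llvm::ListSeparator from StringExtras.h, which prints nothing before the first item and ", " before every later one. A minimal standalone sketch of that pattern follows (the function name and tags are illustrative, not part of the patch):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/raw_ostream.h"
  #include <string>

  // Produces e.g. "cannot lower calls with arbitrary operand bundles: foo, bar".
  static std::string buildBundleError(llvm::StringRef Kind,
                                      llvm::ArrayRef<llvm::StringRef> BadTags) {
    std::string Msg;
    llvm::raw_string_ostream OS(Msg);
    OS << "cannot lower " << Kind << " with arbitrary operand bundles: ";
    llvm::ListSeparator LS; // ", " between items, nothing before the first
    for (llvm::StringRef Tag : BadTags)
      OS << LS << Tag;
    return Msg;
  }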
- if (I.hasOperandBundlesOtherThan( - {LLVMContext::OB_deopt, LLVMContext::OB_funclet})) - reportFatalUsageError( - "cannot lower callbrs with arbitrary operand bundles!"); + failForInvalidBundles(I, "callbrs", + {LLVMContext::OB_deopt, LLVMContext::OB_funclet}); assert(I.isInlineAsm() && "Only know how to handle inlineasm callbr"); visitInlineAsm(I); @@ -9568,12 +9583,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't // have to do anything here to lower funclet bundles. // CFGuardTarget bundles are lowered in LowerCallTo. - if (I.hasOperandBundlesOtherThan( - {LLVMContext::OB_deopt, LLVMContext::OB_funclet, - LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated, - LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_kcfi, - LLVMContext::OB_convergencectrl})) - reportFatalUsageError("cannot lower calls with arbitrary operand bundles!"); + failForInvalidBundles( + I, "calls", + {LLVMContext::OB_deopt, LLVMContext::OB_funclet, + LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated, + LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_kcfi, + LLVMContext::OB_convergencectrl}); SDValue Callee = getValue(I.getCalledOperand()); diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 5f866eea7d4e7..b79911bcf3c49 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -76,7 +76,7 @@ static bool InsertStackProtectors(const TargetMachine *TM, Function *F, /// CreateFailBB - Create a basic block to jump to when the stack protector /// check fails. -static BasicBlock *CreateFailBB(Function *F, const Triple &Trip); +static BasicBlock *CreateFailBB(Function *F, const TargetLowering &TLI); bool SSPLayoutInfo::shouldEmitSDCheck(const BasicBlock &BB) const { return HasPrologue && !HasIRCheck && isa(BB.getTerminator()); @@ -626,7 +626,7 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, // If we're instrumenting a block with a tail call, the check has to be // inserted before the call rather than between it and the return. - Instruction *Prev = CheckLoc->getPrevNonDebugInstruction(); + Instruction *Prev = CheckLoc->getPrevNode(); if (auto *CI = dyn_cast_if_present(Prev)) if (CI->isTailCall() && isInTailCallPosition(*CI, *TM)) CheckLoc = Prev; @@ -673,7 +673,7 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, // merge pass will merge together all of the various BB into one including // fail BB generated by the stack protector pseudo instruction. 
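The rewritten CreateFailBB below keys everything off the libcall table: RTLIB::STACKPROTECTOR_CHECK_FAIL (the no-argument __stack_chk_fail on most targets) is preferred, RTLIB::STACK_SMASH_HANDLER (OpenBSD's __stack_smash_handler, which takes the function name) is the fallback, and the block now always terminates in unreachable instead of falling through after the call. A condensed sketch of the block built for the common case (illustrative IRBuilder usage, not the exact pass code):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"

  using namespace llvm;

  // Builds a block equivalent to:
  //   CallStackCheckFailBlk:
  //     call void @__stack_chk_fail()   ; marked noreturn
  //     unreachable
  static BasicBlock *buildFailBB(Function *F) {
    LLVMContext &Ctx = F->getContext();
    Module *M = F->getParent();
    BasicBlock *BB = BasicBlock::Create(Ctx, "CallStackCheckFailBlk", F);
    IRBuilder<> B(BB);
    FunctionCallee Fail =
        M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Ctx));
    CallInst *Call = B.CreateCall(Fail);
    Call->addFnAttr(Attribute::NoReturn);
    B.CreateUnreachable();
    return BB;
  }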
if (!FailBB) - FailBB = CreateFailBB(F, TM->getTargetTriple()); + FailBB = CreateFailBB(F, *TLI); IRBuilder<> B(CheckLoc); Value *Guard = getStackGuard(TLI, M, B); @@ -706,7 +706,7 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, return HasPrologue; } -BasicBlock *CreateFailBB(Function *F, const Triple &Trip) { +BasicBlock *CreateFailBB(Function *F, const TargetLowering &TLI) { auto *M = F->getParent(); LLVMContext &Context = F->getContext(); BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F); @@ -716,17 +716,25 @@ BasicBlock *CreateFailBB(Function *F, const Triple &Trip) { DILocation::get(Context, 0, 0, F->getSubprogram())); FunctionCallee StackChkFail; SmallVector Args; - if (Trip.isOSOpenBSD()) { - StackChkFail = M->getOrInsertFunction("__stack_smash_handler", - Type::getVoidTy(Context), + + if (const char *ChkFailName = + TLI.getLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL)) { + StackChkFail = + M->getOrInsertFunction(ChkFailName, Type::getVoidTy(Context)); + } else if (const char *SSHName = + TLI.getLibcallName(RTLIB::STACK_SMASH_HANDLER)) { + StackChkFail = M->getOrInsertFunction(SSHName, Type::getVoidTy(Context), PointerType::getUnqual(Context)); Args.push_back(B.CreateGlobalString(F->getName(), "SSH")); } else { - StackChkFail = - M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context)); + Context.emitError("no libcall available for stack protector"); } - cast(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn); - B.CreateCall(StackChkFail, Args); + + if (StackChkFail) { + CallInst *Call = B.CreateCall(StackChkFail, Args); + Call->addFnAttr(Attribute::NoReturn); + } + B.CreateUnreachable(); return FailBB; } diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 518a9339d8d11..18d6bbc0ff2b0 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -792,12 +792,18 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, const MachineOperand &MO = MI.getOperand(1 - Ops[0]); MachineBasicBlock::iterator Pos = MI; - - if (Flags == MachineMemOperand::MOStore) - storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, - Register()); - else + if (Flags == MachineMemOperand::MOStore) { + if (MO.isUndef()) { + // If this is an undef copy, we do not need to bother inserting spill + // code. + BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO); + } else { + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, + Register()); + } + } else loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register()); + return &*--Pos; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 6feeb19bb8589..d4a34555ed820 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1965,15 +1965,26 @@ TargetLoweringBase::getDefaultSafeStackPointerLocation(IRBuilderBase &IRB, Value * TargetLoweringBase::getSafeStackPointerLocation(IRBuilderBase &IRB) const { + // FIXME: Can this triple check be replaced with SAFESTACK_POINTER_ADDRESS + // being available? if (!TM.getTargetTriple().isAndroid()) return getDefaultSafeStackPointerLocation(IRB, true); - // Android provides a libc function to retrieve the address of the current - // thread's unsafe stack pointer. 
Module *M = IRB.GetInsertBlock()->getParent()->getParent(); auto *PtrTy = PointerType::getUnqual(M->getContext()); + + const char *SafestackPointerAddressName = + getLibcallName(RTLIB::SAFESTACK_POINTER_ADDRESS); + if (!SafestackPointerAddressName) { + M->getContext().emitError( + "no libcall available for safestack pointer address"); + return PoisonValue::get(PtrTy); + } + + // Android provides a libc function to retrieve the address of the current + // thread's unsafe stack pointer. FunctionCallee Fn = - M->getOrInsertFunction("__safestack_pointer_address", PtrTy); + M->getOrInsertFunction(SafestackPointerAddressName, PtrTy); return IRB.CreateCall(Fn); } diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 5454cd475f5ed..7e501a9e2aa44 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -322,28 +322,7 @@ void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer, } } - if (NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName)) { - // Emit a descriptor for every function including functions that have an - // available external linkage. We may not want this for imported functions - // that has code in another thinLTO module but we don't have a good way to - // tell them apart from inline functions defined in header files. Therefore - // we put each descriptor in a separate comdat section and rely on the - // linker to deduplicate. - for (const auto *Operand : FuncInfo->operands()) { - const auto *MD = cast(Operand); - auto *GUID = mdconst::dyn_extract(MD->getOperand(0)); - auto *Hash = mdconst::dyn_extract(MD->getOperand(1)); - auto *Name = cast(MD->getOperand(2)); - auto *S = C.getObjectFileInfo()->getPseudoProbeDescSection( - TM->getFunctionSections() ? Name->getString() : StringRef()); - - Streamer.switchSection(S); - Streamer.emitInt64(GUID->getZExtValue()); - Streamer.emitInt64(Hash->getZExtValue()); - Streamer.emitULEB128IntValue(Name->getString().size()); - Streamer.emitBytes(Name->getString()); - } - } + emitPseudoProbeDescMetadata(Streamer, M); if (NamedMDNode *LLVMStats = M.getNamedMetadata("llvm.stats")) { // Emit the metadata for llvm statistics into .llvm_stats section, which is diff --git a/llvm/lib/DWARFCFIChecker/Registers.h b/llvm/lib/DWARFCFIChecker/Registers.h index 262fb5cfa7af9..a372c4c4345bd 100644 --- a/llvm/lib/DWARFCFIChecker/Registers.h +++ b/llvm/lib/DWARFCFIChecker/Registers.h @@ -38,8 +38,7 @@ inline SmallVector getSuperRegs(const MCRegisterInfo *MCRI) { } sort(SuperRegs.begin(), SuperRegs.end()); - SuperRegs.resize(std::distance( - SuperRegs.begin(), std::unique(SuperRegs.begin(), SuperRegs.end()))); + SuperRegs.erase(llvm::unique(SuperRegs), SuperRegs.end()); return SuperRegs; } diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 222dc88098102..559d808a72f98 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -43,6 +43,12 @@ namespace llvm { using namespace dwarf_linker; using namespace dwarf_linker::classic; +enum InvalidStmtSeqOffset { + MaxStmtSeqOffset = UINT64_MAX, + OrigOffsetMissing = MaxStmtSeqOffset - 1, + NewOffsetMissing = MaxStmtSeqOffset - 2, +}; + /// Hold the input and output of the debug info size in bytes. 
struct DebugInfoSize { uint64_t Input; @@ -2315,7 +2321,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { // Some sequences are discarded by the DWARFLinker if they are invalid // (empty). if (OrigRowIter == SeqOffToOrigRow.end()) { - StmtSeq.set(UINT64_MAX); + StmtSeq.set(OrigOffsetMissing); continue; } size_t OrigRowIndex = OrigRowIter->second; @@ -2325,7 +2331,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { if (NewRowIter == OrigRowToNewRow.end()) { // If the original row index is not found in the map, update the // stmt_sequence attribute to the 'invalid offset' magic value. - StmtSeq.set(UINT64_MAX); + StmtSeq.set(NewOffsetMissing); continue; } diff --git a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.cpp b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.cpp index 25a0ccb50c8a9..eaf56b3071592 100644 --- a/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.cpp +++ b/llvm/lib/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.cpp @@ -7,17 +7,13 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h" -#include "llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/DataExtractor.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include #include #include -#include using namespace llvm; using namespace dwarf; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 170224616ac64..840ca8364e218 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5376,58 +5376,90 @@ void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) { void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop, Value *IfCond, ValueToValueMapTy &VMap, + LoopAnalysis &LIA, LoopInfo &LI, Loop *L, const Twine &NamePrefix) { Function *F = CanonicalLoop->getFunction(); - // Define where if branch should be inserted - Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator(); - - // TODO: We should not rely on pass manager. Currently we use pass manager - // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo - // object. We should have a method which returns all blocks between - // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter() - FunctionAnalysisManager FAM; - FAM.registerPass([]() { return DominatorTreeAnalysis(); }); - FAM.registerPass([]() { return LoopAnalysis(); }); - FAM.registerPass([]() { return PassInstrumentationAnalysis(); }); + // We can't do + // if (cond) { + // simd_loop; + // } else { + // non_simd_loop; + // } + // because then the CanonicalLoopInfo would only point to one of the loops, + // causing other constructs that operate on the same loop to malfunction. + // Instead generate + // while (...) 
{ + // if (cond) { + // simd_body; + // } else { + // not_simd_body; + // } + // } + // At least for simple loops, LLVM seems able to hoist the if out of the loop + // body at -O3. - // Get the loop which needs to be cloned - LoopAnalysis LIA; - LoopInfo &&LI = LIA.run(*F, FAM); - Loop *L = LI.getLoopFor(CanonicalLoop->getHeader()); + // Define where the if branch should be inserted + auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt(); // Create additional blocks for the if statement - BasicBlock *Head = SplitBefore->getParent(); - Instruction *HeadOldTerm = Head->getTerminator(); - llvm::LLVMContext &C = Head->getContext(); + BasicBlock *Cond = SplitBeforeIt->getParent(); + llvm::LLVMContext &C = Cond->getContext(); llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create( - C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode()); + C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode()); llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create( - C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit()); + C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit()); // Create if condition branch. - Builder.SetInsertPoint(HeadOldTerm); + Builder.SetInsertPoint(SplitBeforeIt); Instruction *BrInstr = Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock); InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()}; - // Then block contains branch to omp loop which needs to be vectorized + // Then block contains branch to omp loop body which needs to be vectorized spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation()); - ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock); + ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock); Builder.SetInsertPoint(ElseBlock); // Clone loop for the else branch SmallVector NewBlocks; - VMap[CanonicalLoop->getPreheader()] = ElseBlock; - for (BasicBlock *Block : L->getBlocks()) { + SmallVector ExistingBlocks; + ExistingBlocks.reserve(L->getNumBlocks() + 1); + ExistingBlocks.push_back(ThenBlock); + ExistingBlocks.append(L->block_begin(), L->block_end()); + // Cond is the block that has the if clause condition + // LoopCond is omp_loop.cond + // LoopHeader is omp_loop.header + BasicBlock *LoopCond = Cond->getUniquePredecessor(); + BasicBlock *LoopHeader = LoopCond->getUniquePredecessor(); + assert(LoopCond && LoopHeader && "Invalid loop structure"); + for (BasicBlock *Block : ExistingBlocks) { + if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() || + Block == LoopHeader || Block == LoopCond || Block == Cond) { + continue; + } BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F); + + // Fix the name so it is not omp.if.then + if (Block == ThenBlock) + NewBB->setName(NamePrefix + ".if.else"); + NewBB->moveBefore(CanonicalLoop->getExit()); VMap[Block] = NewBB; NewBlocks.push_back(NewBB); } remapInstructionsInBlocks(NewBlocks, VMap); Builder.CreateBr(NewBlocks.front()); + + // The loop latch must have only one predecessor. Currently it is branched to + // from both the 'then' and 'else' branches. 
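The latch split that follows restores the single-predecessor property the comment above calls for. Stepping back, the effect of this body-versioning scheme on a user-level construct is roughly the following (a sketch of the intended lowering under the new scheme, not actual builder output; names are illustrative):

  // Input:
  void saxpy(float *A, const float *B, float X, int N, int Cond) {
  #pragma omp simd if(Cond)
    for (int I = 0; I < N; ++I)
      A[I] += X * B[I];
  }

  // is now lowered as if the body, not the whole loop, had been versioned:
  //   for (int I = 0; I < N; ++I) {
  //     if (Cond)
  //       A[I] += X * B[I];  // simd path; vectorization metadata applies
  //     else
  //       A[I] += X * B[I];  // cloned non-simd path
  //   }
  // so a single CanonicalLoopInfo keeps describing the one remaining loop.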
+ L->getLoopLatch()->splitBasicBlock( + L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true); + + // Ensure that the then block is added to the loop so we add the attributes in + // the next step + L->addBasicBlockToLoop(ThenBlock, LI); } unsigned @@ -5483,20 +5515,7 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, if (IfCond) { ValueToValueMapTy VMap; - createIfVersion(CanonicalLoop, IfCond, VMap, "simd"); - // Add metadata to the cloned loop which disables vectorization - Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch()); - assert(MappedLatch && - "Cannot find value which corresponds to original loop latch"); - assert(isa(MappedLatch) && - "Cannot cast mapped latch block value to BasicBlock"); - BasicBlock *NewLatchBlock = dyn_cast(MappedLatch); - ConstantAsMetadata *BoolConst = - ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx))); - addBasicBlockMetadata( - NewLatchBlock, - {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), - BoolConst})}); + createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd"); } SmallSet Reachable; @@ -5530,6 +5549,14 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup})); } + // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD + // versions so we can't add the loop attributes in that case. + if (IfCond) { + // we can still add llvm.loop.parallel_access + addLoopMetadata(CanonicalLoop, LoopMDList); + return; + } + // Use the above access group metadata to create loop level // metadata, which should be distinct for each loop. ConstantAsMetadata *BoolConst = diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 84a56058de834..8fb33c30e5cac 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2288,39 +2288,36 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { // Collect a map of {backing storage : dbg.declares} (currently "backing // storage" is limited to Allocas). We'll use this to find dbg.declares to // delete after running `trackAssignments`. - DenseMap> DbgDeclares; DenseMap> DVRDeclares; // Create another similar map of {storage : variables} that we'll pass to // trackAssignments. StorageToVarsMap Vars; - auto ProcessDeclare = [&](auto *Declare, auto &DeclareList) { + auto ProcessDeclare = [&](DbgVariableRecord &Declare) { // FIXME: trackAssignments doesn't let you specify any modifiers to the // variable (e.g. fragment) or location (e.g. offset), so we have to // leave dbg.declares with non-empty expressions in place. - if (Declare->getExpression()->getNumElements() != 0) + if (Declare.getExpression()->getNumElements() != 0) return; - if (!Declare->getAddress()) + if (!Declare.getAddress()) return; if (AllocaInst *Alloca = - dyn_cast(Declare->getAddress()->stripPointerCasts())) { + dyn_cast(Declare.getAddress()->stripPointerCasts())) { // FIXME: Skip VLAs for now (let these variables use dbg.declares). if (!Alloca->isStaticAlloca()) return; // Similarly, skip scalable vectors (use dbg.declares instead). 
if (auto Sz = Alloca->getAllocationSize(*DL); Sz && Sz->isScalable()) return; - DeclareList[Alloca].insert(Declare); - Vars[Alloca].insert(VarRecord(Declare)); + DVRDeclares[Alloca].insert(&Declare); + Vars[Alloca].insert(VarRecord(&Declare)); } }; for (auto &BB : F) { for (auto &I : BB) { for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { if (DVR.isDbgDeclare()) - ProcessDeclare(&DVR, DVRDeclares); + ProcessDeclare(DVR); } - if (DbgDeclareInst *DDI = dyn_cast(&I)) - ProcessDeclare(DDI, DbgDeclares); } } @@ -2336,8 +2333,8 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { trackAssignments(F.begin(), F.end(), Vars, *DL); // Delete dbg.declares for variables now tracked with assignment tracking. - auto DeleteSubsumedDeclare = [&](const auto &Markers, auto &Declares) { - (void)Markers; + for (auto &[Insts, Declares] : DVRDeclares) { + auto Markers = at::getDVRAssignmentMarkers(Insts); for (auto *Declare : Declares) { // Assert that the alloca that Declare uses is now linked to a dbg.assign // describing the same variable (i.e. check that this dbg.declare has @@ -2356,10 +2353,6 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { Changed = true; } }; - for (auto &P : DbgDeclares) - DeleteSubsumedDeclare(at::getAssignmentMarkers(P.first), P.second); - for (auto &P : DVRDeclares) - DeleteSubsumedDeclare(at::getDVRAssignmentMarkers(P.first), P.second); return Changed; } diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index 2270923bd3719..f16963dce56e1 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -49,20 +49,11 @@ uint32_t DIType::getAlignInBits() const { const DIExpression::FragmentInfo DebugVariable::DefaultFragment = { std::numeric_limits::max(), std::numeric_limits::min()}; -DebugVariable::DebugVariable(const DbgVariableIntrinsic *DII) - : Variable(DII->getVariable()), - Fragment(DII->getExpression()->getFragmentInfo()), - InlinedAt(DII->getDebugLoc().getInlinedAt()) {} - DebugVariable::DebugVariable(const DbgVariableRecord *DVR) : Variable(DVR->getVariable()), Fragment(DVR->getExpression()->getFragmentInfo()), InlinedAt(DVR->getDebugLoc().getInlinedAt()) {} -DebugVariableAggregate::DebugVariableAggregate(const DbgVariableIntrinsic *DVI) - : DebugVariable(DVI->getVariable(), std::nullopt, - DVI->getDebugLoc()->getInlinedAt()) {} - DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, uint64_t AtomGroup, uint8_t AtomRank, ArrayRef MDs, bool ImplicitCode) diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index cbf39b8adf1b2..763cc1832b794 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -1235,26 +1235,7 @@ bool Instruction::isDebugOrPseudoInst() const { return isa(this) || isa(this); } -const Instruction * -Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const { - for (const Instruction *I = getNextNode(); I; I = I->getNextNode()) - if (!isa(I) && !(SkipPseudoOp && isa(I))) - return I; - return nullptr; -} - -const Instruction * -Instruction::getPrevNonDebugInstruction(bool SkipPseudoOp) const { - for (const Instruction *I = getPrevNode(); I; I = I->getPrevNode()) - if (!isa(I) && !(SkipPseudoOp && isa(I))) - return I; - return nullptr; -} - const DebugLoc &Instruction::getStableDebugLoc() const { - if (isa(this)) - if (const Instruction *Next = getNextNonDebugInstruction()) - return Next->getDebugLoc(); return getDebugLoc(); } diff --git a/llvm/lib/IR/Instructions.cpp 
b/llvm/lib/IR/Instructions.cpp index 9fe79366f9b2f..b8963823f1c67 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -355,7 +355,7 @@ bool CallBase::isTailCall() const { } Intrinsic::ID CallBase::getIntrinsicID() const { - if (auto *F = getCalledFunction()) + if (auto *F = dyn_cast_or_null(getCalledOperand())) return F->getIntrinsicID(); return Intrinsic::not_intrinsic; } diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index d2632d50dff06..6c35ade3e57c5 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -661,20 +661,20 @@ static int lookupLLVMIntrinsicByName(ArrayRef NameOffsetTable, // `equal_range` requires the comparison to work with either side being an // offset or the value. Detect which kind each side is to set up the // compared strings. - StringRef LHSStr; - if constexpr (std::is_integral_v) { - LHSStr = IntrinsicNameTable[LHS]; - } else { + const char *LHSStr; + if constexpr (std::is_integral_v) + LHSStr = IntrinsicNameTable.getCString(LHS); + else LHSStr = LHS; - } - StringRef RHSStr; - if constexpr (std::is_integral_v) { - RHSStr = IntrinsicNameTable[RHS]; - } else { + + const char *RHSStr; + if constexpr (std::is_integral_v) + RHSStr = IntrinsicNameTable.getCString(RHS); + else RHSStr = RHS; - } - return strncmp(LHSStr.data() + CmpStart, RHSStr.data() + CmpStart, - CmpEnd - CmpStart) < 0; + + return strncmp(LHSStr + CmpStart, RHSStr + CmpStart, CmpEnd - CmpStart) < + 0; }; LastLow = Low; std::tie(Low, High) = std::equal_range(Low, High, Name.data(), Cmp); diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 1d9a61ce4a10b..47a828842b481 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; diff --git a/llvm/lib/IR/OptBisect.cpp b/llvm/lib/IR/OptBisect.cpp index 427e8b78fd03f..29ca268408265 100644 --- a/llvm/lib/IR/OptBisect.cpp +++ b/llvm/lib/IR/OptBisect.cpp @@ -25,6 +25,11 @@ static OptBisect &getOptBisector() { return OptBisector; } +static OptDisable &getOptDisabler() { + static OptDisable OptDisabler; + return OptDisabler; +} + static cl::opt OptBisectLimit("opt-bisect-limit", cl::Hidden, cl::init(OptBisect::Disabled), cl::Optional, cl::cb([](int Limit) { @@ -37,6 +42,18 @@ static cl::opt OptBisectVerbose( cl::desc("Show verbose output when opt-bisect-limit is set"), cl::Hidden, cl::init(true), cl::Optional); +static cl::list OptDisablePasses( + "opt-disable", cl::Hidden, cl::CommaSeparated, cl::Optional, + cl::cb([](const std::string &Pass) { + getOptDisabler().setDisabled(Pass); + }), + cl::desc("Optimization pass(es) to disable (comma-separated list)")); + +static cl::opt + OptDisableVerbose("opt-disable-enable-verbosity", + cl::desc("Show verbose output when opt-disable is set"), + cl::Hidden, cl::init(false), cl::Optional); + static void printPassMessage(StringRef Name, int PassNum, StringRef TargetDesc, bool Running) { StringRef Status = Running ? "" : "NOT "; @@ -55,4 +72,27 @@ bool OptBisect::shouldRunPass(StringRef PassName, return ShouldRun; } -OptPassGate &llvm::getGlobalPassGate() { return getOptBisector(); } +static void printDisablePassMessage(const StringRef &Name, StringRef TargetDesc, + bool Running) { + StringRef Status = Running ? 
"" : "NOT "; + dbgs() << "OptDisable: " << Status << "running pass " << Name << " on " + << TargetDesc << "\n"; +} + +void OptDisable::setDisabled(StringRef Pass) { DisabledPasses.insert(Pass); } + +bool OptDisable::shouldRunPass(StringRef PassName, + StringRef IRDescription) const { + assert(isEnabled()); + + const bool ShouldRun = !DisabledPasses.contains(PassName); + if (OptDisableVerbose) + printDisablePassMessage(PassName, IRDescription, ShouldRun); + return ShouldRun; +} + +OptPassGate &llvm::getGlobalPassGate() { + if (getOptDisabler().isEnabled()) + return getOptDisabler(); + return getOptBisector(); +} diff --git a/llvm/lib/IR/Pass.cpp b/llvm/lib/IR/Pass.cpp index 2c5ef7193b463..dec7c9a9ab18c 100644 --- a/llvm/lib/IR/Pass.cpp +++ b/llvm/lib/IR/Pass.cpp @@ -62,8 +62,12 @@ static std::string getDescription(const Module &M) { bool ModulePass::skipModule(const Module &M) const { const OptPassGate &Gate = M.getContext().getOptPassGate(); - return Gate.isEnabled() && - !Gate.shouldRunPass(this->getPassName(), getDescription(M)); + + StringRef PassName = getPassArgument(); + if (PassName.empty()) + PassName = this->getPassName(); + + return Gate.isEnabled() && !Gate.shouldRunPass(PassName, getDescription(M)); } bool Pass::mustPreserveAnalysisID(char &AID) const { @@ -86,6 +90,16 @@ StringRef Pass::getPassName() const { return "Unnamed pass: implement Pass::getPassName()"; } +/// getPassArgument - Return a nice clean name for a pass +/// corresponding to that used to enable the pass in opt +StringRef Pass::getPassArgument() const { + AnalysisID AID = getPassID(); + const PassInfo *PI = Pass::lookupPassInfo(AID); + if (PI) + return PI->getPassArgument(); + return ""; +} + void Pass::preparePassManager(PMStack &) { // By default, don't do anything. } @@ -173,8 +187,12 @@ static std::string getDescription(const Function &F) { bool FunctionPass::skipFunction(const Function &F) const { OptPassGate &Gate = F.getContext().getOptPassGate(); - if (Gate.isEnabled() && - !Gate.shouldRunPass(this->getPassName(), getDescription(F))) + + StringRef PassName = getPassArgument(); + if (PassName.empty()) + PassName = this->getPassName(); + + if (Gate.isEnabled() && !Gate.shouldRunPass(PassName, getDescription(F))) return true; if (F.hasOptNone()) { diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 74cde5e9cee92..b1864897dafa6 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/StringTable.h" using namespace llvm; using namespace RTLIB; @@ -62,69 +62,6 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT, Info.setLibcallImplCallingConv(Impl, CallingConv::ARM_AAPCS); } -static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info, - bool FiniteOnlyFuncs = false) { - Info.setLibcallImpl(RTLIB::REM_F128, RTLIB::fmodf128); - Info.setLibcallImpl(RTLIB::FMA_F128, RTLIB::fmaf128); - Info.setLibcallImpl(RTLIB::SQRT_F128, RTLIB::sqrtf128); - Info.setLibcallImpl(RTLIB::CBRT_F128, RTLIB::cbrtf128); - Info.setLibcallImpl(RTLIB::LOG_F128, RTLIB::logf128); - Info.setLibcallImpl(RTLIB::LOG2_F128, RTLIB::log2f128); - Info.setLibcallImpl(RTLIB::LOG10_F128, RTLIB::log10f128); - Info.setLibcallImpl(RTLIB::EXP_F128, RTLIB::expf128); - Info.setLibcallImpl(RTLIB::EXP2_F128, RTLIB::exp2f128); - Info.setLibcallImpl(RTLIB::EXP10_F128, 
RTLIB::exp10f128); - Info.setLibcallImpl(RTLIB::SIN_F128, RTLIB::sinf128); - Info.setLibcallImpl(RTLIB::COS_F128, RTLIB::cosf128); - Info.setLibcallImpl(RTLIB::TAN_F128, RTLIB::tanf128); - Info.setLibcallImpl(RTLIB::SINCOS_F128, RTLIB::sincosf128); - Info.setLibcallImpl(RTLIB::ASIN_F128, RTLIB::asinf128); - Info.setLibcallImpl(RTLIB::ACOS_F128, RTLIB::acosf128); - Info.setLibcallImpl(RTLIB::ATAN_F128, RTLIB::atanf128); - Info.setLibcallImpl(RTLIB::ATAN2_F128, RTLIB::atan2f128); - Info.setLibcallImpl(RTLIB::SINH_F128, RTLIB::sinhf128); - Info.setLibcallImpl(RTLIB::COSH_F128, RTLIB::coshf128); - Info.setLibcallImpl(RTLIB::TANH_F128, RTLIB::tanhf128); - Info.setLibcallImpl(RTLIB::POW_F128, RTLIB::powf128); - Info.setLibcallImpl(RTLIB::CEIL_F128, RTLIB::ceilf128); - Info.setLibcallImpl(RTLIB::TRUNC_F128, RTLIB::truncf128); - Info.setLibcallImpl(RTLIB::RINT_F128, RTLIB::rintf128); - Info.setLibcallImpl(RTLIB::NEARBYINT_F128, RTLIB::nearbyintf128); - Info.setLibcallImpl(RTLIB::ROUND_F128, RTLIB::roundf128); - Info.setLibcallImpl(RTLIB::ROUNDEVEN_F128, RTLIB::roundevenf128); - Info.setLibcallImpl(RTLIB::FLOOR_F128, RTLIB::floorf128); - Info.setLibcallImpl(RTLIB::COPYSIGN_F128, RTLIB::copysignf128); - Info.setLibcallImpl(RTLIB::FMIN_F128, RTLIB::fminf128); - Info.setLibcallImpl(RTLIB::FMAX_F128, RTLIB::fmaxf128); - Info.setLibcallImpl(RTLIB::FMINIMUM_F128, RTLIB::fminimumf128); - Info.setLibcallImpl(RTLIB::FMAXIMUM_F128, RTLIB::fmaximumf128); - Info.setLibcallImpl(RTLIB::FMINIMUM_NUM_F128, RTLIB::fminimum_numf128); - Info.setLibcallImpl(RTLIB::FMAXIMUM_NUM_F128, RTLIB::fmaximum_numf128); - Info.setLibcallImpl(RTLIB::LROUND_F128, RTLIB::lroundf128); - Info.setLibcallImpl(RTLIB::LLROUND_F128, RTLIB::llroundf128); - Info.setLibcallImpl(RTLIB::LRINT_F128, RTLIB::lrintf128); - Info.setLibcallImpl(RTLIB::LLRINT_F128, RTLIB::llrintf128); - Info.setLibcallImpl(RTLIB::LDEXP_F128, RTLIB::ldexpf128); - Info.setLibcallImpl(RTLIB::FREXP_F128, RTLIB::frexpf128); - Info.setLibcallImpl(RTLIB::MODF_F128, RTLIB::modff128); - - if (FiniteOnlyFuncs) { - Info.setLibcallImpl(RTLIB::LOG_FINITE_F128, RTLIB::__logf128_finite); - Info.setLibcallImpl(RTLIB::LOG2_FINITE_F128, RTLIB::__log2f128_finite); - Info.setLibcallImpl(RTLIB::LOG10_FINITE_F128, RTLIB::__log10f128_finite); - Info.setLibcallImpl(RTLIB::EXP_FINITE_F128, RTLIB::__expf128_finite); - Info.setLibcallImpl(RTLIB::EXP2_FINITE_F128, RTLIB::__exp2f128_finite); - Info.setLibcallImpl(RTLIB::POW_FINITE_F128, RTLIB::__powf128_finite); - } else { - Info.setLibcallImpl(RTLIB::LOG_FINITE_F128, RTLIB::Unsupported); - Info.setLibcallImpl(RTLIB::LOG2_FINITE_F128, RTLIB::Unsupported); - Info.setLibcallImpl(RTLIB::LOG10_FINITE_F128, RTLIB::Unsupported); - Info.setLibcallImpl(RTLIB::EXP_FINITE_F128, RTLIB::Unsupported); - Info.setLibcallImpl(RTLIB::EXP2_FINITE_F128, RTLIB::Unsupported); - Info.setLibcallImpl(RTLIB::POW_FINITE_F128, RTLIB::Unsupported); - } -} - void RTLIB::RuntimeLibcallsInfo::initDefaultLibCallImpls() { std::memcpy(LibcallImpls, DefaultLibcallImpls, sizeof(LibcallImpls)); static_assert(sizeof(LibcallImpls) == sizeof(DefaultLibcallImpls), @@ -143,10 +80,6 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, if (TT.isAMDGPU() || TT.isNVPTX() || TT.isWasm()) return; - // Use the f128 variants of math functions on x86 - if (TT.isX86() && TT.isGNUEnvironment()) - setLongDoubleIsF128Libm(*this, /*FiniteOnlyFuncs=*/true); - if (TT.isX86() || TT.isVE() || TT.isARM() || TT.isThumb()) { if (ExceptionModel == ExceptionHandling::SjLj) 
setLibcallImpl(RTLIB::UNWIND_RESUME, RTLIB::_Unwind_SjLj_Resume); @@ -160,91 +93,33 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, setLibcallImpl(RTLIB::FPEXT_F16_F32, RTLIB::__extendhfsf2); setLibcallImpl(RTLIB::FPROUND_F32_F16, RTLIB::__truncsfhf2); - // Some darwins have an optimized __bzero/bzero function. - if (TT.isX86()) { - if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6)) - setLibcallImpl(RTLIB::BZERO, RTLIB::__bzero); - } - - if (darwinHasSinCosStret(TT)) { - setLibcallImpl(RTLIB::SINCOS_STRET_F32, RTLIB::__sincosf_stret); - setLibcallImpl(RTLIB::SINCOS_STRET_F64, RTLIB::__sincos_stret); - if (TT.isWatchABI()) { - setLibcallImplCallingConv(RTLIB::__sincosf_stret, - CallingConv::ARM_AAPCS_VFP); - setLibcallImplCallingConv(RTLIB::__sincos_stret, - CallingConv::ARM_AAPCS_VFP); - } - } - - if (darwinHasExp10(TT)) { - setLibcallImpl(RTLIB::EXP10_F32, RTLIB::__exp10f); - setLibcallImpl(RTLIB::EXP10_F64, RTLIB::__exp10); - } else { + if (!darwinHasExp10(TT)) { setLibcallImpl(RTLIB::EXP10_F32, RTLIB::Unsupported); setLibcallImpl(RTLIB::EXP10_F64, RTLIB::Unsupported); } } - if (hasSinCos(TT)) { - setLibcallImpl(RTLIB::SINCOS_F32, RTLIB::sincosf); - setLibcallImpl(RTLIB::SINCOS_F64, RTLIB::sincos); - setLibcallImpl(RTLIB::SINCOS_F80, RTLIB::sincos_f80); - setLibcallImpl(RTLIB::SINCOS_F128, RTLIB::sincos_f128); - setLibcallImpl(RTLIB::SINCOS_PPCF128, RTLIB::sincos_ppcf128); - } - - if (TT.isPS()) { - setLibcallImpl(RTLIB::SINCOS_F32, RTLIB::sincosf); - setLibcallImpl(RTLIB::SINCOS_F64, RTLIB::sincos); - } - if (TT.isOSOpenBSD()) { setLibcallImpl(RTLIB::STACKPROTECTOR_CHECK_FAIL, RTLIB::Unsupported); + setLibcallImpl(RTLIB::STACK_SMASH_HANDLER, RTLIB::__stack_smash_handler); } - if (TT.isOSWindows() && !TT.isOSCygMing()) { - setLibcallImpl(RTLIB::LDEXP_F32, RTLIB::Unsupported); - setLibcallImpl(RTLIB::LDEXP_F80, RTLIB::Unsupported); - setLibcallImpl(RTLIB::LDEXP_F128, RTLIB::Unsupported); - setLibcallImpl(RTLIB::LDEXP_PPCF128, RTLIB::Unsupported); - - setLibcallImpl(RTLIB::FREXP_F32, RTLIB::Unsupported); - setLibcallImpl(RTLIB::FREXP_F80, RTLIB::Unsupported); - setLibcallImpl(RTLIB::FREXP_F128, RTLIB::Unsupported); - setLibcallImpl(RTLIB::FREXP_PPCF128, RTLIB::Unsupported); - } + // Skip default manual processing for targets that have been fully ported to + // tablegen for now. Eventually the rest of this should be deleted. + if (TT.isX86() || TT.isAArch64() || TT.isWasm()) + return; - if (TT.isOSMSVCRT()) { - // MSVCRT doesn't have powi; fall back to pow - setLibcallImpl(RTLIB::POWI_F32, RTLIB::Unsupported); - setLibcallImpl(RTLIB::POWI_F64, RTLIB::Unsupported); + if (TT.isARM() || TT.isThumb()) { + setARMLibcallNames(*this, TT, FloatABI, EABIVersion); + return; } - // Setup Windows compiler runtime calls. 
- if (TT.getArch() == Triple::x86 && - (TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment())) { - static const struct { - const RTLIB::Libcall Op; - const RTLIB::LibcallImpl Impl; - const CallingConv::ID CC; - } LibraryCalls[] = { - {RTLIB::SDIV_I64, RTLIB::_alldiv, CallingConv::X86_StdCall}, - {RTLIB::UDIV_I64, RTLIB::_aulldiv, CallingConv::X86_StdCall}, - {RTLIB::SREM_I64, RTLIB::_allrem, CallingConv::X86_StdCall}, - {RTLIB::UREM_I64, RTLIB::_aullrem, CallingConv::X86_StdCall}, - {RTLIB::MUL_I64, RTLIB::_allmul, CallingConv::X86_StdCall}, - }; - - for (const auto &LC : LibraryCalls) { - setLibcallImpl(LC.Op, LC.Impl); - setLibcallImplCallingConv(LC.Impl, LC.CC); - } + if (hasSinCos(TT)) { + setLibcallImpl(RTLIB::SINCOS_F32, RTLIB::sincosf); + setLibcallImpl(RTLIB::SINCOS_F64, RTLIB::sincos); + setLibcallImpl(RTLIB::SINCOS_F128, RTLIB::sincos_f128); } - if (TT.isARM() || TT.isThumb()) - setARMLibcallNames(*this, TT, FloatABI, EABIVersion); - // These libcalls are only available in compiler-rt, not libgcc. if (TT.isArch64Bit()) { setLibcallImpl(RTLIB::SHL_I128, RTLIB::__ashlti3); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 8004077b92665..8c8ed3c5e47ba 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -597,7 +597,6 @@ class Verifier : public InstVisitor, VerifierSupport { void visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call); void visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI); void visitVPIntrinsic(VPIntrinsic &VPI); - void visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII); void visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI); void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI); void visitAtomicRMWInst(AtomicRMWInst &RMWI); @@ -636,15 +635,12 @@ class Verifier : public InstVisitor, VerifierSupport { void verifyFrameRecoverIndices(); void verifySiblingFuncletUnwinds(); - void verifyFragmentExpression(const DbgVariableIntrinsic &I); void verifyFragmentExpression(const DbgVariableRecord &I); template void verifyFragmentExpression(const DIVariable &V, DIExpression::FragmentInfo Fragment, ValueOrMetadata *Desc); - void verifyFnArgs(const DbgVariableIntrinsic &I); void verifyFnArgs(const DbgVariableRecord &DVR); - void verifyNotEntryValue(const DbgVariableIntrinsic &I); void verifyNotEntryValue(const DbgVariableRecord &I); /// Module-level debug info verification... 
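Tying the Pass.cpp hunks above together: the legacy pass gate is now keyed on a pass's registered command-line argument rather than its descriptive name, which is what lets the new -opt-disable option (added in OptBisect.cpp earlier in this patch) accept the same names one would pass to opt; -opt-disable-enable-verbosity logs each skip decision. A condensed sketch of the name-resolution order (assuming the pass was registered so lookupPassInfo can find it; "licm" is just an illustrative argument):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Pass.h"
  #include "llvm/PassInfo.h"

  using namespace llvm;

  // The name a legacy pass now hands to OptPassGate::shouldRunPass().
  static StringRef resolveGateName(const Pass &P) {
    if (const PassInfo *PI = Pass::lookupPassInfo(P.getPassID()))
      if (!PI->getPassArgument().empty())
        return PI->getPassArgument(); // e.g. "licm", as in -opt-disable=licm
    return P.getPassName();           // fallback: human-readable pass name
  }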
@@ -3185,12 +3181,6 @@ void Verifier::visitFunction(const Function &F) { CheckDI(SP->describes(&F), "!dbg attachment points at wrong subprogram for function", N, &F, &I, DL, Scope, SP); - - if (DL->getAtomGroup()) - CheckDI(DL->getScope()->getSubprogram()->getKeyInstructionsEnabled(), - "DbgLoc uses atomGroup but DISubprogram doesn't have Key " - "Instructions enabled", - DL, DL->getScope()->getSubprogram()); }; for (auto &BB : F) for (auto &I : BB) { @@ -5492,11 +5482,15 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *N = I.getDebugLoc().getAsMDNode()) { CheckDI(isa(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N, AreDebugLocsAllowed::Yes); - } - if (auto *DII = dyn_cast(&I)) { - verifyFragmentExpression(*DII); - verifyNotEntryValue(*DII); + if (auto *DL = dyn_cast(N)) { + if (DL->getAtomGroup()) { + CheckDI(DL->getScope()->getSubprogram()->getKeyInstructionsEnabled(), + "DbgLoc uses atomGroup but DISubprogram doesn't have Key " + "Instructions enabled", + DL, DL->getScope()->getSubprogram()); + } + } } SmallVector, 4> MDs; @@ -5703,18 +5697,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { visitConstrainedFPIntrinsic(cast(Call)); break; case Intrinsic::dbg_declare: // llvm.dbg.declare - Check(isa(Call.getArgOperand(0)), - "invalid llvm.dbg.declare intrinsic call 1", Call); - visitDbgIntrinsic("declare", cast(Call)); - break; - case Intrinsic::dbg_value: // llvm.dbg.value - visitDbgIntrinsic("value", cast(Call)); - break; - case Intrinsic::dbg_assign: // llvm.dbg.assign - visitDbgIntrinsic("assign", cast(Call)); - break; - case Intrinsic::dbg_label: // llvm.dbg.label - visitDbgLabelIntrinsic("label", cast(Call)); + case Intrinsic::dbg_value: // llvm.dbg.value + case Intrinsic::dbg_assign: // llvm.dbg.assign + case Intrinsic::dbg_label: // llvm.dbg.label + // We no longer interpret debug intrinsics (the old variable-location + // design). They're meaningless as far as LLVM is concerned; we could make + // it an error for them to appear, but it's possible we'll have users + // converting back to intrinsics for the foreseeable future (such as DXIL), + // so tolerate their existence. break; case Intrinsic::memcpy: case Intrinsic::memcpy_inline: @@ -6541,7 +6531,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Check(!Call.paramHasAttr(3, Attribute::InReg), "VGPR arguments must not have the `inreg` attribute", &Call); - auto *Next = Call.getNextNonDebugInstruction(); + auto *Next = Call.getNextNode(); bool IsAMDUnreachable = Next && isa(Next) && cast(Next)->getIntrinsicID() == Intrinsic::amdgcn_unreachable; @@ -7123,123 +7113,6 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { } } -void Verifier::visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII) { - auto *MD = DII.getRawLocation(); - CheckDI(isa(MD) || isa(MD) || - (isa(MD) && !cast(MD)->getNumOperands()), - "invalid llvm.dbg." + Kind + " intrinsic address/value", &DII, MD); - CheckDI(isa(DII.getRawVariable()), - "invalid llvm.dbg." + Kind + " intrinsic variable", &DII, - DII.getRawVariable()); - CheckDI(isa(DII.getRawExpression()), - "invalid llvm.dbg." 
+ Kind + " intrinsic expression", &DII, - DII.getRawExpression()); - - if (auto *DAI = dyn_cast(&DII)) { - CheckDI(isa(DAI->getRawAssignID()), - "invalid llvm.dbg.assign intrinsic DIAssignID", &DII, - DAI->getRawAssignID()); - const auto *RawAddr = DAI->getRawAddress(); - CheckDI( - isa(RawAddr) || - (isa(RawAddr) && !cast(RawAddr)->getNumOperands()), - "invalid llvm.dbg.assign intrinsic address", &DII, - DAI->getRawAddress()); - CheckDI(isa(DAI->getRawAddressExpression()), - "invalid llvm.dbg.assign intrinsic address expression", &DII, - DAI->getRawAddressExpression()); - // All of the linked instructions should be in the same function as DII. - for (Instruction *I : at::getAssignmentInsts(DAI)) - CheckDI(DAI->getFunction() == I->getFunction(), - "inst not in same function as dbg.assign", I, DAI); - } - - // Ignore broken !dbg attachments; they're checked elsewhere. - if (MDNode *N = DII.getDebugLoc().getAsMDNode()) - if (!isa(N)) - return; - - BasicBlock *BB = DII.getParent(); - Function *F = BB ? BB->getParent() : nullptr; - - // The scopes for variables and !dbg attachments must agree. - DILocalVariable *Var = DII.getVariable(); - DILocation *Loc = DII.getDebugLoc(); - CheckDI(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment", - &DII, BB, F); - - DISubprogram *VarSP = getSubprogram(Var->getRawScope()); - DISubprogram *LocSP = getSubprogram(Loc->getRawScope()); - if (!VarSP || !LocSP) - return; // Broken scope chains are checked elsewhere. - - CheckDI(VarSP == LocSP, - "mismatched subprogram between llvm.dbg." + Kind + - " variable and !dbg attachment", - &DII, BB, F, Var, Var->getScope()->getSubprogram(), Loc, - Loc->getScope()->getSubprogram()); - - // This check is redundant with one in visitLocalVariable(). - CheckDI(isType(Var->getRawType()), "invalid type ref", Var, - Var->getRawType()); - verifyFnArgs(DII); -} - -void Verifier::visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI) { - CheckDI(isa(DLI.getRawLabel()), - "invalid llvm.dbg." + Kind + " intrinsic variable", &DLI, - DLI.getRawLabel()); - - // Ignore broken !dbg attachments; they're checked elsewhere. - if (MDNode *N = DLI.getDebugLoc().getAsMDNode()) - if (!isa(N)) - return; - - BasicBlock *BB = DLI.getParent(); - Function *F = BB ? BB->getParent() : nullptr; - - // The scopes for variables and !dbg attachments must agree. - DILabel *Label = DLI.getLabel(); - DILocation *Loc = DLI.getDebugLoc(); - Check(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment", &DLI, - BB, F); - - DISubprogram *LabelSP = getSubprogram(Label->getRawScope()); - DISubprogram *LocSP = getSubprogram(Loc->getRawScope()); - if (!LabelSP || !LocSP) - return; - - CheckDI(LabelSP == LocSP, - "mismatched subprogram between llvm.dbg." + Kind + - " label and !dbg attachment", - &DLI, BB, F, Label, Label->getScope()->getSubprogram(), Loc, - Loc->getScope()->getSubprogram()); -} - -void Verifier::verifyFragmentExpression(const DbgVariableIntrinsic &I) { - DILocalVariable *V = dyn_cast_or_null(I.getRawVariable()); - DIExpression *E = dyn_cast_or_null(I.getRawExpression()); - - // We don't know whether this intrinsic verified correctly. - if (!V || !E || !E->isValid()) - return; - - // Nothing to do if this isn't a DW_OP_LLVM_fragment expression. - auto Fragment = E->getFragmentInfo(); - if (!Fragment) - return; - - // The frontend helps out GDB by emitting the members of local anonymous - // unions as artificial local variables with shared storage. 
When SROA splits - // the storage for artificial local variables that are smaller than the entire - // union, the overhang piece will be outside of the allotted space for the - // variable and this check fails. - // FIXME: Remove this check as soon as clang stops doing this; it hides bugs. - if (V->isArtificial()) - return; - - verifyFragmentExpression(*V, *Fragment, &I); -} void Verifier::verifyFragmentExpression(const DbgVariableRecord &DVR) { DILocalVariable *V = dyn_cast_or_null(DVR.getRawVariable()); DIExpression *E = dyn_cast_or_null(DVR.getRawExpression()); @@ -7282,34 +7155,6 @@ void Verifier::verifyFragmentExpression(const DIVariable &V, CheckDI(FragSize != *VarSize, "fragment covers entire variable", Desc, &V); } -void Verifier::verifyFnArgs(const DbgVariableIntrinsic &I) { - // This function does not take the scope of noninlined function arguments into - // account. Don't run it if current function is nodebug, because it may - // contain inlined debug intrinsics. - if (!HasDebugInfo) - return; - - // For performance reasons only check non-inlined ones. - if (I.getDebugLoc()->getInlinedAt()) - return; - - DILocalVariable *Var = I.getVariable(); - CheckDI(Var, "dbg intrinsic without variable"); - - unsigned ArgNo = Var->getArg(); - if (!ArgNo) - return; - - // Verify there are no duplicate function argument debug info entries. - // These will cause hard-to-debug assertions in the DWARF backend. - if (DebugFnArgs.size() < ArgNo) - DebugFnArgs.resize(ArgNo, nullptr); - - auto *Prev = DebugFnArgs[ArgNo - 1]; - DebugFnArgs[ArgNo - 1] = Var; - CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I, - Prev, Var); -} void Verifier::verifyFnArgs(const DbgVariableRecord &DVR) { // This function does not take the scope of noninlined function arguments into // account. Don't run it if current function is nodebug, because it may @@ -7339,29 +7184,6 @@ void Verifier::verifyFnArgs(const DbgVariableRecord &DVR) { Prev, Var); } -void Verifier::verifyNotEntryValue(const DbgVariableIntrinsic &I) { - DIExpression *E = dyn_cast_or_null(I.getRawExpression()); - - // We don't know whether this intrinsic verified correctly. - if (!E || !E->isValid()) - return; - - if (isa(I.getRawLocation())) { - Value *VarValue = I.getVariableLocationOp(0); - if (isa(VarValue) || isa(VarValue)) - return; - // We allow EntryValues for swift async arguments, as they have an - // ABI-guarantee to be turned into a specific register. 
- if (auto *ArgLoc = dyn_cast_or_null(VarValue); - ArgLoc && ArgLoc->hasAttribute(Attribute::SwiftAsync)) - return; - } - - CheckDI(!E->isEntryValue(), - "Entry values are only allowed in MIR unless they target a " - "swiftasync Argument", - &I); -} void Verifier::verifyNotEntryValue(const DbgVariableRecord &DVR) { DIExpression *E = dyn_cast_or_null(DVR.getRawExpression()); diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp index 39ef521031069..828d9cf56a71f 100644 --- a/llvm/lib/MC/MCAsmBackend.cpp +++ b/llvm/lib/MC/MCAsmBackend.cpp @@ -105,7 +105,8 @@ MCFixupKindInfo MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { return Builtins[Kind - FK_NONE]; } -bool MCAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, +bool MCAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &, + const MCFixup &Fixup, const MCValue &, uint64_t Value, bool Resolved) const { if (!Resolved) @@ -138,9 +139,7 @@ bool MCAsmBackend::isDarwinCanonicalPersonality(const MCSymbol *Sym) const { const MCSubtargetInfo *MCAsmBackend::getSubtargetInfo(const MCFragment &F) { const MCSubtargetInfo *STI = nullptr; - if (auto *DF = dyn_cast(&F)) { - STI = DF->getSubtargetInfo(); - assert(!DF->hasInstructions() || STI != nullptr); - } + STI = F.getSubtargetInfo(); + assert(!F.hasInstructions() || STI != nullptr); return STI; } diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index c89dc549d8e34..67c53e01a6111 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -407,13 +407,8 @@ class MCAsmStreamer final : public MCStreamer { const MCPseudoProbeInlineStack &InlineStack, MCSymbol *FnSym) override; - void emitBundleAlignMode(Align Alignment) override; - void emitBundleLock(bool AlignToEnd) override; - void emitBundleUnlock() override; - - std::optional> - emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, - SMLoc Loc, const MCSubtargetInfo &STI) override; + void emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) override; void emitAddrsig() override; void emitAddrsigSym(const MCSymbol *Sym) override; @@ -2466,32 +2461,14 @@ void MCAsmStreamer::emitPseudoProbe(uint64_t Guid, uint64_t Index, for (const auto &Site : InlineStack) OS << " @ " << std::get<0>(Site) << ":" << std::get<1>(Site); - OS << " " << FnSym->getName(); - - EmitEOL(); -} - -void MCAsmStreamer::emitBundleAlignMode(Align Alignment) { - OS << "\t.bundle_align_mode " << Log2(Alignment); - EmitEOL(); -} - -void MCAsmStreamer::emitBundleLock(bool AlignToEnd) { - OS << "\t.bundle_lock"; - if (AlignToEnd) - OS << " align_to_end"; - EmitEOL(); -} + OS << " "; + FnSym->print(OS, MAI); -void MCAsmStreamer::emitBundleUnlock() { - OS << "\t.bundle_unlock"; EmitEOL(); } -std::optional> -MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, - const MCExpr *Expr, SMLoc, - const MCSubtargetInfo &STI) { +void MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc) { OS << "\t.reloc "; MAI->printExpr(OS, Offset); OS << ", " << Name; @@ -2500,7 +2477,6 @@ MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, MAI->printExpr(OS, *Expr); } EmitEOL(); - return std::nullopt; } void MCAsmStreamer::emitAddrsig() { diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index fd8a8c3a79c9f..3e96bdf5169d8 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -94,7 +94,6 @@ void MCAssembler::reset() { 
Sections.clear(); Symbols.clear(); ThumbFuncs.clear(); - BundleAlignSize = 0; // reset objects owned by us if (getBackendPtr()) @@ -203,7 +202,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const { case MCFragment::FT_CVInlineLines: case MCFragment::FT_CVDefRange: case MCFragment::FT_PseudoProbe: - return cast(F).getContents().size(); + return F.getSize(); case MCFragment::FT_Fill: { auto &FF = cast(F); int64_t NumValues = 0; @@ -282,87 +281,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const { llvm_unreachable("invalid fragment kind"); } -// Compute the amount of padding required before the fragment \p F to -// obey bundling restrictions, where \p FOffset is the fragment's offset in -// its section and \p FSize is the fragment's size. -static uint64_t computeBundlePadding(unsigned BundleSize, - const MCEncodedFragment *F, - uint64_t FOffset, uint64_t FSize) { - uint64_t OffsetInBundle = FOffset & (BundleSize - 1); - uint64_t EndOfFragment = OffsetInBundle + FSize; - - // There are two kinds of bundling restrictions: - // - // 1) For alignToBundleEnd(), add padding to ensure that the fragment will - // *end* on a bundle boundary. - // 2) Otherwise, check if the fragment would cross a bundle boundary. If it - // would, add padding until the end of the bundle so that the fragment - // will start in a new one. - if (F->alignToBundleEnd()) { - // Three possibilities here: - // - // A) The fragment just happens to end at a bundle boundary, so we're good. - // B) The fragment ends before the current bundle boundary: pad it just - // enough to reach the boundary. - // C) The fragment ends after the current bundle boundary: pad it until it - // reaches the end of the next bundle boundary. - // - // Note: this code could be made shorter with some modulo trickery, but it's - // intentionally kept in its more explicit form for simplicity. - if (EndOfFragment == BundleSize) - return 0; - else if (EndOfFragment < BundleSize) - return BundleSize - EndOfFragment; - else { // EndOfFragment > BundleSize - return 2 * BundleSize - EndOfFragment; - } - } else if (OffsetInBundle > 0 && EndOfFragment > BundleSize) - return BundleSize - OffsetInBundle; - else - return 0; -} - -void MCAssembler::layoutBundle(MCFragment *Prev, MCFragment *F) const { - // If bundling is enabled and this fragment has instructions in it, it has to - // obey the bundling restrictions. With padding, we'll have: - // - // - // BundlePadding - // ||| - // ------------------------------------- - // Prev |##########| F | - // ------------------------------------- - // ^ - // | - // F->Offset - // - // The fragment's offset will point to after the padding, and its computed - // size won't include the padding. - // - // ".align N" is an example of a directive that introduces multiple - // fragments. We could add a special case to handle ".align N" by emitting - // within-fragment padding (which would produce less padding when N is less - // than the bundle size), but for now we don't. 
- // - assert(isa(F) && - "Only MCEncodedFragment implementations have instructions"); - MCEncodedFragment *EF = cast(F); - uint64_t FSize = computeFragmentSize(*EF); - - if (FSize > getBundleAlignSize()) - report_fatal_error("Fragment can't be larger than a bundle size"); - - uint64_t RequiredBundlePadding = - computeBundlePadding(getBundleAlignSize(), EF, EF->Offset, FSize); - if (RequiredBundlePadding > UINT8_MAX) - report_fatal_error("Padding cannot exceed 255 bytes"); - EF->setBundlePadding(static_cast(RequiredBundlePadding)); - EF->Offset += RequiredBundlePadding; - if (auto *DF = dyn_cast_or_null(Prev)) - if (DF->getContents().empty()) - DF->Offset = EF->Offset; -} - // Simple getSymbolOffset helper for the non-variable case. static bool getLabelOffset(const MCAssembler &Asm, const MCSymbol &S, bool ReportError, uint64_t &Val) { @@ -480,39 +398,8 @@ bool MCAssembler::registerSymbol(const MCSymbol &Symbol) { return Changed; } -void MCAssembler::writeFragmentPadding(raw_ostream &OS, - const MCEncodedFragment &EF, - uint64_t FSize) const { - assert(getBackendPtr() && "Expected assembler backend"); - // Should NOP padding be written out before this fragment? - unsigned BundlePadding = EF.getBundlePadding(); - if (BundlePadding > 0) { - assert(isBundlingEnabled() && - "Writing bundle padding with disabled bundling"); - assert(EF.hasInstructions() && - "Writing bundle padding for a fragment without instructions"); - - unsigned TotalLength = BundlePadding + static_cast(FSize); - const MCSubtargetInfo *STI = EF.getSubtargetInfo(); - if (EF.alignToBundleEnd() && TotalLength > getBundleAlignSize()) { - // If the padding itself crosses a bundle boundary, it must be emitted - // in 2 pieces, since even nop instructions must not cross boundaries. - // v--------------v <- BundleAlignSize - // v---------v <- BundlePadding - // ---------------------------- - // | Prev |####|####| F | - // ---------------------------- - // ^-------------------^ <- TotalLength - unsigned DistanceToBoundary = TotalLength - getBundleAlignSize(); - if (!getBackend().writeNopData(OS, DistanceToBoundary, STI)) - report_fatal_error("unable to write NOP sequence of " + - Twine(DistanceToBoundary) + " bytes"); - BundlePadding -= DistanceToBoundary; - } - if (!getBackend().writeNopData(OS, BundlePadding, STI)) - report_fatal_error("unable to write NOP sequence of " + - Twine(BundlePadding) + " bytes"); - } +void MCAssembler::addRelocDirective(RelocDirective RD) { + relocDirectives.push_back(RD); } /// Write the fragment \p F to the output file. @@ -523,9 +410,6 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, llvm::endianness Endian = Asm.getBackend().Endian; - if (const MCEncodedFragment *EF = dyn_cast(&F)) - Asm.writeFragmentPadding(OS, *EF, FragmentSize); - // This variable (and its dummy usage) is to participate in the assert at // the end of the function. uint64_t Start = OS.tell(); @@ -546,8 +430,9 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, ++stats::EmittedDataFragments; else if (F.getKind() == MCFragment::FT_Relaxable) ++stats::EmittedRelaxableFragments; - const auto &EF = cast(F); + const auto &EF = cast(F); OS << StringRef(EF.getContents().data(), EF.getContents().size()); + OS << StringRef(EF.getVarContents().data(), EF.getVarContents().size()); break; } case MCFragment::FT_Align: { @@ -712,11 +597,10 @@ void MCAssembler::writeSectionData(raw_ostream &OS, // Check that we aren't trying to write a non-zero contents (or fixups) // into a virtual section. 
This is to support clients which use standard // directives to fill the contents of virtual sections. - const MCDataFragment &DF = cast(F); - if (DF.getFixups().size()) + if (F.getFixups().size() || F.getVarFixups().size()) reportError(SMLoc(), Sec->getVirtualSectionKind() + " section '" + Sec->getName() + "' cannot have fixups"); - for (char C : DF.getContents()) + for (char C : F.getContents()) if (C) { reportError(SMLoc(), Sec->getVirtualSectionKind() + " section '" + Sec->getName() + @@ -769,7 +653,7 @@ void MCAssembler::layout() { // Chain together fragments from all subsections. if (Sec.Subsections.size() > 1) { - MCDataFragment Dummy; + MCFragment Dummy; MCFragment *Tail = &Dummy; for (auto &[_, List] : Sec.Subsections) { assert(List.Head); @@ -815,19 +699,55 @@ void MCAssembler::layout() { // helps check whether a PC-relative fixup is fully resolved. this->HasFinalLayout = true; + // Resolve .reloc offsets and add fixups. + for (auto &PF : relocDirectives) { + MCValue Res; + auto &O = PF.Offset; + if (!O.evaluateAsValue(Res, *this)) { + getContext().reportError(O.getLoc(), ".reloc offset is not relocatable"); + continue; + } + auto *Sym = Res.getAddSym(); + auto *F = Sym ? Sym->getFragment() : nullptr; + auto *Sec = F ? F->getParent() : nullptr; + if (Res.getSubSym() || !Sec) { + getContext().reportError(O.getLoc(), + ".reloc offset is not relative to a section"); + continue; + } + + uint64_t Offset = Sym ? Sym->getOffset() + Res.getConstant() : 0; + F->addFixup(MCFixup::create(Offset, PF.Expr, PF.Kind)); + } + // Evaluate and apply the fixups, generating relocation entries as necessary. for (MCSection &Sec : *this) { - for (MCFragment &Frag : Sec) { + for (MCFragment &F : Sec) { // Process fragments with fixups here. - if (auto *F = dyn_cast(&Frag)) { - auto Contents = F->getContents(); - for (MCFixup &Fixup : F->getFixups()) { + if (F.isEncoded()) { + auto Contents = F.getContents(); + for (MCFixup &Fixup : F.getFixups()) { uint64_t FixedValue; MCValue Target; - evaluateFixup(Frag, Fixup, Target, FixedValue, + evaluateFixup(F, Fixup, Target, FixedValue, /*RecordReloc=*/true, Contents); } - } else if (auto *AF = dyn_cast(&Frag)) { + // In the variable part, fixup offsets are relative to the fixed part's + // start. Extend the variable contents to the left to account for the + // fixed part size. + auto VarFixups = F.getVarFixups(); + if (VarFixups.size()) { + Contents = + MutableArrayRef(F.getParent()->ContentStorage) + .slice(F.VarContentStart - Contents.size(), F.getSize()); + for (MCFixup &Fixup : VarFixups) { + uint64_t FixedValue; + MCValue Target; + evaluateFixup(F, Fixup, Target, FixedValue, + /*RecordReloc=*/true, Contents); + } + } + } else if (auto *AF = dyn_cast(&F)) { // For RISC-V linker relaxation, an alignment relocation might be // needed. 
if (AF->hasEmitNops()) @@ -847,18 +767,18 @@ void MCAssembler::Finish() { assert(PendingErrors.empty()); } -bool MCAssembler::fixupNeedsRelaxation(const MCRelaxableFragment &F, +bool MCAssembler::fixupNeedsRelaxation(const MCFragment &F, const MCFixup &Fixup) const { assert(getBackendPtr() && "Expected assembler backend"); MCValue Target; uint64_t Value; bool Resolved = evaluateFixup(F, const_cast(Fixup), Target, Value, /*RecordReloc=*/false, {}); - return getBackend().fixupNeedsRelaxationAdvanced(Fixup, Target, Value, + return getBackend().fixupNeedsRelaxationAdvanced(F, Fixup, Target, Value, Resolved); } -bool MCAssembler::relaxInstruction(MCRelaxableFragment &F) { +bool MCAssembler::relaxInstruction(MCFragment &F) { assert(getEmitterPtr() && "Expected CodeEmitter defined for relaxInstruction"); // If this inst doesn't ever need relaxation, ignore it. This occurs when we @@ -869,7 +789,7 @@ bool MCAssembler::relaxInstruction(MCRelaxableFragment &F) { return false; bool DoRelax = false; - for (const MCFixup &Fixup : F.getFixups()) + for (const MCFixup &Fixup : F.getVarFixups()) if ((DoRelax = fixupNeedsRelaxation(F, Fixup))) break; if (!DoRelax) @@ -877,7 +797,7 @@ bool MCAssembler::relaxInstruction(MCRelaxableFragment &F) { ++stats::RelaxedInstructions; - // TODO Refactor relaxInstruction to accept MCRelaxableFragment and remove + // TODO Refactor relaxInstruction to accept MCFragment and remove // `setInst`. MCInst Relaxed = F.getInst(); getBackend().relaxInstruction(Relaxed, *F.getSubtargetInfo()); @@ -887,30 +807,30 @@ bool MCAssembler::relaxInstruction(MCRelaxableFragment &F) { SmallVector Data; SmallVector Fixups; getEmitter().encodeInstruction(Relaxed, Data, Fixups, *F.getSubtargetInfo()); - F.setContents(Data); - F.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); return true; } -bool MCAssembler::relaxLEB(MCLEBFragment &LF) { - const unsigned OldSize = static_cast(LF.getContents().size()); +bool MCAssembler::relaxLEB(MCFragment &F) { + const unsigned OldSize = F.getVarSize(); unsigned PadTo = OldSize; int64_t Value; - LF.clearFixups(); + F.clearVarFixups(); // Use evaluateKnownAbsolute for Mach-O as a hack: .subsections_via_symbols // requires that .uleb128 A-B is foldable where A and B reside in different // fragments. This is used by __gcc_except_table. bool Abs = getWriter().getSubsectionsViaSymbols() - ? LF.getValue().evaluateKnownAbsolute(Value, *this) - : LF.getValue().evaluateAsAbsolute(Value, *this); + ? F.getLEBValue().evaluateKnownAbsolute(Value, *this) + : F.getLEBValue().evaluateAsAbsolute(Value, *this); if (!Abs) { bool Relaxed, UseZeroPad; - std::tie(Relaxed, UseZeroPad) = getBackend().relaxLEB128(LF, Value); + std::tie(Relaxed, UseZeroPad) = getBackend().relaxLEB128(F, Value); if (!Relaxed) { - reportError(LF.getValue().getLoc(), - Twine(LF.isSigned() ? ".s" : ".u") + + reportError(F.getLEBValue().getLoc(), + Twine(F.isLEBSigned() ? ".s" : ".u") + "leb128 expression is not absolute"); - LF.setValue(MCConstantExpr::create(0, Context)); + F.setLEBValue(MCConstantExpr::create(0, Context)); } uint8_t Tmp[10]; // maximum size: ceil(64/7) PadTo = std::max(PadTo, encodeULEB128(uint64_t(Value), Tmp)); @@ -923,11 +843,11 @@ bool MCAssembler::relaxLEB(MCLEBFragment &LF) { // without either adding padding to an LEB fragment or adding extra padding // to a later alignment fragment. To accommodate such tables, relaxation can // only increase an LEB fragment size here, not decrease it. See PR35809. 
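The padding trick relaxLEB() relies on is that LEB128 can be widened with redundant continuation bytes, which is why a fragment may only grow during layout. A self-contained sketch mirroring the PadTo behavior of llvm::encodeULEB128 (hand-rolled here as an assumption-labeled stand-in, not the LLVM helper itself):

#include <cassert>
#include <cstdint>
#include <vector>

// ULEB128 with a PadTo floor: 0x80 continuation bytes (terminated by a
// 0x00 byte) let the encoding occupy a fixed width while decoding to the
// same value, so a shrinking value never shrinks the fragment.
static unsigned encodeULEB128Padded(uint64_t Value, std::vector<uint8_t> &Out,
                                    unsigned PadTo) {
  unsigned Count = 0;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0 || Count + 1 < PadTo)
      Byte |= 0x80; // More bytes follow.
    Out.push_back(Byte);
    ++Count;
  } while (Value != 0 || Count < PadTo);
  return Count;
}

int main() {
  std::vector<uint8_t> A, B;
  assert(encodeULEB128Padded(624485, A, 0) == 3); // 0xe5 0x8e 0x26
  // The same value padded to 5 bytes decodes identically.
  assert(encodeULEB128Padded(624485, B, 5) == 5); // 0xe5 0x8e 0xa6 0x80 0x00
  return 0;
}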
- if (LF.isSigned()) + if (F.isLEBSigned()) Size = encodeSLEB128(Value, Data, PadTo); else Size = encodeULEB128(Value, Data, PadTo); - LF.setContents({reinterpret_cast(Data), Size}); + F.setVarContents({reinterpret_cast(Data), Size}); return OldSize != Size; } @@ -992,48 +912,45 @@ bool MCAssembler::relaxBoundaryAlign(MCBoundaryAlignFragment &BF) { return true; } -bool MCAssembler::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF) { +bool MCAssembler::relaxDwarfLineAddr(MCFragment &F) { bool WasRelaxed; - if (getBackend().relaxDwarfLineAddr(DF, WasRelaxed)) + if (getBackend().relaxDwarfLineAddr(F, WasRelaxed)) return WasRelaxed; MCContext &Context = getContext(); - auto OldSize = DF.getContents().size(); + auto OldSize = F.getVarSize(); int64_t AddrDelta; - bool Abs = DF.getAddrDelta().evaluateKnownAbsolute(AddrDelta, *this); + bool Abs = F.getDwarfAddrDelta().evaluateKnownAbsolute(AddrDelta, *this); assert(Abs && "We created a line delta with an invalid expression"); (void)Abs; - int64_t LineDelta; - LineDelta = DF.getLineDelta(); SmallVector Data; - - MCDwarfLineAddr::encode(Context, getDWARFLinetableParams(), LineDelta, - AddrDelta, Data); - DF.setContents(Data); - DF.clearFixups(); + MCDwarfLineAddr::encode(Context, getDWARFLinetableParams(), + F.getDwarfLineDelta(), AddrDelta, Data); + F.setVarContents(Data); + F.clearVarFixups(); return OldSize != Data.size(); } -bool MCAssembler::relaxDwarfCallFrameFragment(MCDwarfCallFrameFragment &DF) { +bool MCAssembler::relaxDwarfCallFrameFragment(MCFragment &F) { bool WasRelaxed; - if (getBackend().relaxDwarfCFA(DF, WasRelaxed)) + if (getBackend().relaxDwarfCFA(F, WasRelaxed)) return WasRelaxed; MCContext &Context = getContext(); int64_t Value; - bool Abs = DF.getAddrDelta().evaluateAsAbsolute(Value, *this); + bool Abs = F.getDwarfAddrDelta().evaluateAsAbsolute(Value, *this); if (!Abs) { - reportError(DF.getAddrDelta().getLoc(), + reportError(F.getDwarfAddrDelta().getLoc(), "invalid CFI advance_loc expression"); - DF.setAddrDelta(MCConstantExpr::create(0, Context)); + F.setDwarfAddrDelta(MCConstantExpr::create(0, Context)); return false; } - auto OldSize = DF.getContents().size(); + auto OldSize = F.getVarContents().size(); SmallVector Data; MCDwarfFrameEmitter::encodeAdvanceLoc(Context, Value, Data); - DF.setContents(Data); - DF.clearFixups(); + F.setVarContents(Data); + F.clearVarFixups(); return OldSize != Data.size(); } @@ -1078,15 +995,14 @@ bool MCAssembler::relaxFragment(MCFragment &F) { default: return false; case MCFragment::FT_Relaxable: - assert(!getRelaxAll() && - "Did not expect a MCRelaxableFragment in RelaxAll mode"); - return relaxInstruction(cast(F)); + assert(!getRelaxAll() && "Did not expect a FT_Relaxable in RelaxAll mode"); + return relaxInstruction(F); + case MCFragment::FT_LEB: + return relaxLEB(F); case MCFragment::FT_Dwarf: - return relaxDwarfLineAddr(cast(F)); + return relaxDwarfLineAddr(F); case MCFragment::FT_DwarfFrame: - return relaxDwarfCallFrameFragment(cast(F)); - case MCFragment::FT_LEB: - return relaxLEB(cast(F)); + return relaxDwarfCallFrameFragment(F); case MCFragment::FT_BoundaryAlign: return relaxBoundaryAlign(cast(F)); case MCFragment::FT_CVInlineLines: @@ -1101,17 +1017,9 @@ bool MCAssembler::relaxFragment(MCFragment &F) { } void MCAssembler::layoutSection(MCSection &Sec) { - MCFragment *Prev = nullptr; uint64_t Offset = 0; for (MCFragment &F : Sec) { F.Offset = Offset; - if (LLVM_UNLIKELY(isBundlingEnabled())) { - if (F.hasInstructions()) { - layoutBundle(Prev, &F); - Offset = F.Offset; - } - Prev = &F; - 
}
    Offset += computeFragmentSize(F);
  }
}
diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp
index 5d7914396e09f..1f9825185175a 100644
--- a/llvm/lib/MC/MCCodeView.cpp
+++ b/llvm/lib/MC/MCCodeView.cpp
@@ -166,7 +166,7 @@ void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
  // somewhere else. If somebody wants two string tables in their .s file, one
  // will just be empty.
  if (!StrTabFragment) {
-    StrTabFragment = Ctx.allocFragment<MCDataFragment>();
+    StrTabFragment = Ctx.allocFragment<MCFragment>();
    OS.insert(StrTabFragment);
  }
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index b0d1cb41fac1c..12b3fbab8fb8f 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -87,9 +87,10 @@ MCContext::MCContext(const Triple &TheTriple, const MCAsmInfo *mai,
    Env = IsMachO;
    break;
  case Triple::COFF:
-    if (!TheTriple.isOSWindows() && !TheTriple.isUEFI())
-      report_fatal_error(
-          "Cannot initialize MC for non-Windows COFF object files.");
+    if (!TheTriple.isOSWindows() && !TheTriple.isUEFI()) {
+      reportFatalUsageError(
+          "cannot initialize MC for non-Windows COFF object files");
+    }
    Env = IsCOFF;
    break;
@@ -199,10 +200,10 @@ MCInst *MCContext::createMCInst() {
  return new (MCInstAllocator.Allocate()) MCInst;
}

-// Allocate the initial MCDataFragment for the begin symbol.
-MCDataFragment *MCContext::allocInitialFragment(MCSection &Sec) {
+// Allocate the initial MCFragment for the begin symbol.
+MCFragment *MCContext::allocInitialFragment(MCSection &Sec) {
  assert(!Sec.curFragList()->Head);
-  auto *F = allocFragment<MCDataFragment>();
+  auto *F = allocFragment<MCFragment>();
  F->setParent(&Sec);
  Sec.curFragList()->Head = F;
  Sec.curFragList()->Tail = F;
@@ -733,9 +734,8 @@ MCSectionGOFF *MCContext::getGOFFSection(SectionKind Kind, StringRef Name,
      UniqueName.append("/").append(P->getName());
  }
  // Do the lookup. If we don't have a hit, return a new section.
-  auto IterBool = GOFFUniquingMap.insert(std::make_pair(UniqueName, nullptr));
-  auto Iter = IterBool.first;
-  if (!IterBool.second)
+  auto [Iter, Inserted] = GOFFUniquingMap.try_emplace(UniqueName);
+  if (!Inserted)
    return Iter->second;

  StringRef CachedName = StringRef(Iter->first.c_str(), Name.size());
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 4bc3f4642ea02..49071bdec3dbd 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -48,10 +48,6 @@ ELFObjectWriter &MCELFStreamer::getWriter() {
  return static_cast<ELFObjectWriter &>(getAssembler().getWriter());
}

-bool MCELFStreamer::isBundleLocked() const {
-  return getCurrentSectionOnly()->isBundleLocked();
-}
-
void MCELFStreamer::initSections(bool NoExecStack, const MCSubtargetInfo &STI) {
  MCContext &Ctx = getContext();
  switchSection(Ctx.getObjectFileInfo()->getTextSection());
@@ -72,7 +68,7 @@ void MCELFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
    Symbol->setType(ELF::STT_TLS);
}

-void MCELFStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCDataFragment &F,
+void MCELFStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment &F,
                                   uint64_t Offset) {
  auto *Symbol = cast<MCSymbolELF>(S);
  MCObjectStreamer::emitLabelAtPos(Symbol, Loc, F, Offset);
@@ -83,23 +79,8 @@ void MCELFStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCDataFragment &F,
    Symbol->setType(ELF::STT_TLS);
}

-// If bundle alignment is used and there are any instructions in the section, it
-// needs to be aligned to at least the bundle size.
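The getGOFFSection() change above is the usual modernization from insert(std::make_pair(...)) to try_emplace with structured bindings; a minimal sketch of the idiom with stand-in types (ownership elided for brevity):

#include <cassert>
#include <map>
#include <string>

// One lookup, no temporary pair: the mapped value is default-constructed
// (nullptr here) only when the key was actually inserted.
struct Section { std::string Name; };

static Section *getOrCreate(std::map<std::string, Section *> &Map,
                            const std::string &UniqueName) {
  auto [Iter, Inserted] = Map.try_emplace(UniqueName);
  if (!Inserted)
    return Iter->second; // Cache hit: reuse the existing section.
  Iter->second = new Section{UniqueName};
  return Iter->second;
}

int main() {
  std::map<std::string, Section *> Map;
  Section *A = getOrCreate(Map, "A/parent");
  assert(getOrCreate(Map, "A/parent") == A); // Second call hits the cache.
  return 0;
}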
-static void setSectionAlignmentForBundling(const MCAssembler &Assembler, - MCSection *Section) { - if (Assembler.isBundlingEnabled() && Section->hasInstructions()) - Section->ensureMinAlignment(Align(Assembler.getBundleAlignSize())); -} - void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) { MCAssembler &Asm = getAssembler(); - if (auto *F = getCurrentFragment()) { - if (isBundleLocked()) - report_fatal_error("Unterminated .bundle_lock when changing a section"); - - // Ensure the previous section gets aligned if necessary. - setSectionAlignmentForBundling(Asm, F->getParent()); - } auto *SectionELF = static_cast(Section); const MCSymbol *Grp = SectionELF->getGroup(); if (Grp) @@ -313,22 +294,6 @@ void MCELFStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size, emitCommonSymbol(Symbol, Size, ByteAlignment); } -void MCELFStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, - SMLoc Loc) { - if (isBundleLocked()) - report_fatal_error("Emitting values inside a locked bundle is forbidden"); - MCObjectStreamer::emitValueImpl(Value, Size, Loc); -} - -void MCELFStreamer::emitValueToAlignment(Align Alignment, int64_t Value, - uint8_t ValueSize, - unsigned MaxBytesToEmit) { - if (isBundleLocked()) - report_fatal_error("Emitting values inside a locked bundle is forbidden"); - MCObjectStreamer::emitValueToAlignment(Alignment, Value, ValueSize, - MaxBytesToEmit); -} - void MCELFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From, const MCSymbolRefExpr *To, uint64_t Count) { @@ -349,8 +314,9 @@ void MCELFStreamer::emitIdent(StringRef IdentString) { popSection(); } -void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE, - uint64_t Offset) { +void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *Sym, + uint64_t Offset, + const MCSymbolRefExpr *&SRE) { const MCSymbol *S = &SRE->getSymbol(); if (S->isTemporary()) { if (!S->isInSection()) { @@ -363,13 +329,9 @@ void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE, S->setUsedInReloc(); SRE = MCSymbolRefExpr::create(S, getContext(), SRE->getLoc()); } - const MCConstantExpr *MCOffset = MCConstantExpr::create(Offset, getContext()); - if (std::optional> Err = - MCObjectStreamer::emitRelocDirective( - *MCOffset, "BFD_RELOC_NONE", SRE, SRE->getLoc(), - *getContext().getSubtargetInfo())) - report_fatal_error("Relocation for CG Profile could not be created: " + - Twine(Err->second)); + auto *O = MCBinaryExpr::createAdd( + Sym, MCConstantExpr::create(Offset, getContext()), getContext()); + MCObjectStreamer::emitRelocDirective(*O, "BFD_RELOC_NONE", SRE); } void MCELFStreamer::finalizeCGProfile() { @@ -382,126 +344,17 @@ void MCELFStreamer::finalizeCGProfile() { pushSection(); switchSection(CGProfile); uint64_t Offset = 0; + auto *Sym = + MCSymbolRefExpr::create(CGProfile->getBeginSymbol(), getContext()); for (auto &E : W.getCGProfile()) { - finalizeCGProfileEntry(E.From, Offset); - finalizeCGProfileEntry(E.To, Offset); + finalizeCGProfileEntry(Sym, Offset, E.From); + finalizeCGProfileEntry(Sym, Offset, E.To); emitIntValue(E.Count, sizeof(uint64_t)); Offset += sizeof(uint64_t); } popSection(); } -// A fragment can only have one Subtarget, and when bundling is enabled we -// sometimes need to use the same fragment. We give an error if there -// are conflicting Subtargets. 
-static void CheckBundleSubtargets(const MCSubtargetInfo *OldSTI, - const MCSubtargetInfo *NewSTI) { - if (OldSTI && NewSTI && OldSTI != NewSTI) - report_fatal_error("A Bundle can only have one Subtarget."); -} - -void MCELFStreamer::emitInstToData(const MCInst &Inst, - const MCSubtargetInfo &STI) { - MCAssembler &Assembler = getAssembler(); - - // There are several possibilities here: - // - // If bundling is disabled, append the encoded instruction to the current data - // fragment (or create a new such fragment if the current fragment is not a - // data fragment, or the Subtarget has changed). - // - // If bundling is enabled: - // - If we're not in a bundle-locked group, emit the instruction into a - // fragment of its own. - // - If we're in a bundle-locked group, append the instruction to the current - // data fragment because we want all the instructions in a group to get into - // the same fragment. Be careful not to do that for the first instruction in - // the group, though. - MCDataFragment *DF; - - if (Assembler.isBundlingEnabled()) { - MCSection &Sec = *getCurrentSectionOnly(); - if (isBundleLocked() && !Sec.isBundleGroupBeforeFirstInst()) { - // If we are bundle-locked, we re-use the current fragment. - // The bundle-locking directive ensures this is a new data fragment. - DF = cast(getCurrentFragment()); - CheckBundleSubtargets(DF->getSubtargetInfo(), &STI); - } else { - DF = getContext().allocFragment(); - insert(DF); - } - if (Sec.getBundleLockState() == MCSection::BundleLockedAlignToEnd) { - // If this fragment is for a group marked "align_to_end", set a flag - // in the fragment. This can happen after the fragment has already been - // created if there are nested bundle_align groups and an inner one - // is the one marked align_to_end. - DF->setAlignToBundleEnd(true); - } - - // We're now emitting an instruction in a bundle group, so this flag has - // to be turned off. - Sec.setBundleGroupBeforeFirstInst(false); - } else { - DF = getOrCreateDataFragment(&STI); - } - - // Emit instruction directly into data fragment. - size_t FixupStartIndex = DF->getFixups().size(); - size_t CodeOffset = DF->getContents().size(); - SmallVector Fixups; - Assembler.getEmitter().encodeInstruction(Inst, DF->getContentsForAppending(), - Fixups, STI); - DF->doneAppending(); - if (!Fixups.empty()) - DF->appendFixups(Fixups); - - for (auto &Fixup : MutableArrayRef(DF->getFixups()).slice(FixupStartIndex)) { - Fixup.setOffset(Fixup.getOffset() + CodeOffset); - if (Fixup.isLinkerRelaxable()) { - DF->setLinkerRelaxable(); - getCurrentSectionOnly()->setLinkerRelaxable(); - } - } - - DF->setHasInstructions(STI); -} - -void MCELFStreamer::emitBundleAlignMode(Align Alignment) { - assert(Log2(Alignment) <= 30 && "Invalid bundle alignment"); - MCAssembler &Assembler = getAssembler(); - if (Alignment > 1 && (Assembler.getBundleAlignSize() == 0 || - Assembler.getBundleAlignSize() == Alignment.value())) - Assembler.setBundleAlignSize(Alignment.value()); - else - report_fatal_error(".bundle_align_mode cannot be changed once set"); -} - -void MCELFStreamer::emitBundleLock(bool AlignToEnd) { - MCSection &Sec = *getCurrentSectionOnly(); - - if (!getAssembler().isBundlingEnabled()) - report_fatal_error(".bundle_lock forbidden when bundling is disabled"); - - if (!isBundleLocked()) - Sec.setBundleGroupBeforeFirstInst(true); - - Sec.setBundleLockState(AlignToEnd ? 
MCSection::BundleLockedAlignToEnd - : MCSection::BundleLocked); -} - -void MCELFStreamer::emitBundleUnlock() { - MCSection &Sec = *getCurrentSectionOnly(); - - if (!getAssembler().isBundlingEnabled()) - report_fatal_error(".bundle_unlock forbidden when bundling is disabled"); - else if (!isBundleLocked()) - report_fatal_error(".bundle_unlock without matching lock"); - else if (Sec.isBundleGroupBeforeFirstInst()) - report_fatal_error("Empty bundle-locked group is forbidden"); - - Sec.setBundleLockState(MCSection::NotBundleLocked); -} - void MCELFStreamer::finishImpl() { // Emit the .gnu attributes section if any attributes have been added. if (!GNUAttributes.empty()) { @@ -510,10 +363,6 @@ void MCELFStreamer::finishImpl() { DummyAttributeSection, GNUAttributes); } - // Ensure the last section gets aligned if necessary. - if (MCFragment *F = getCurrentFragment()) - setSectionAlignmentForBundling(getAssembler(), F->getParent()); - finalizeCGProfile(); emitFrames(nullptr); diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index a196591744099..22dff497911de 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -324,7 +324,7 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // symbols is limited to specific cases where the fragments between two // symbols (including the fragments the symbols are defined in) are // fixed-size fragments so the difference can be calculated. For example, - // this is important when the Subtarget is changed and a new MCDataFragment + // this is important when the Subtarget is changed and a new MCFragment // is created in the case of foo: instr; .arch_extension ext; instr .if . - // foo. if (SA.isVariable() || SB.isVariable()) @@ -352,7 +352,7 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // the linker. bool BBeforeRelax = false, AAfterRelax = false; for (auto F = FB; F; F = F->getNext()) { - auto DF = dyn_cast(F); + auto DF = F->getKind() == MCFragment::FT_Data ? F : nullptr; if (DF && DF->isLinkerRelaxable()) { if (&*F != FB || SBOffset != DF->getContents().size()) BBeforeRelax = true; @@ -373,12 +373,12 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, unsigned Count; if (DF) { Displacement += DF->getContents().size(); - } else if (auto *RF = dyn_cast(F); - RF && Asm->hasFinalLayout()) { + } else if (F->getKind() == MCFragment::FT_Relaxable && + Asm->hasFinalLayout()) { // Before finishLayout, a relaxable fragment's size is indeterminate. // After layout, during relocation generation, it can be treated as a // data fragment. 
- Displacement += RF->getContents().size(); + Displacement += F->getSize(); } else if (auto *AF = dyn_cast(F); AF && Layout && AF->hasEmitNops() && !Asm->getBackend().shouldInsertExtraNopBytesForCodeAlign( diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index c59fabe5df1b9..bfe045abe6e53 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -24,12 +24,15 @@ using namespace llvm; -static_assert(std::is_trivially_destructible_v, +static_assert(std::is_trivially_destructible_v, "fragment classes must be trivially destructible"); MCFragment::MCFragment(FragmentType Kind, bool HasInstructions) - : Kind(Kind), HasInstructions(HasInstructions), AlignToBundleEnd(false), - LinkerRelaxable(false), AllowAutoPadding(false) {} + : Kind(Kind), LinkerRelaxable(false), HasInstructions(HasInstructions), + AllowAutoPadding(false) { + static_assert(sizeof(MCFragment::Tail) <= 16, + "Keep the variable-size tail small"); +} const MCSymbol *MCFragment::getAtom() const { return cast(Parent)->getAtom(LayoutOrder); @@ -59,10 +62,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { // clang-format on } - if (const auto *EF = dyn_cast(this)) - if (auto Pad = static_cast(EF->getBundlePadding())) - OS << " BundlePadding:" << Pad; - auto printFixups = [&](llvm::ArrayRef Fixups) { if (Fixups.empty()) return; @@ -83,18 +82,56 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << " Nops"; break; } - case MCFragment::FT_Data: { - const auto *F = cast(this); - if (F->isLinkerRelaxable()) + case MCFragment::FT_Data: + case MCFragment::FT_Relaxable: + case MCFragment::FT_LEB: + case MCFragment::FT_Dwarf: + case MCFragment::FT_DwarfFrame: { + if (isLinkerRelaxable()) OS << " LinkerRelaxable"; - auto Contents = F->getContents(); - OS << " Size:" << Contents.size() << " ["; - for (unsigned i = 0, e = Contents.size(); i != e; ++i) { + auto Fixed = getContents(); + auto Var = getVarContents(); + OS << " Size:" << Fixed.size(); + if (getKind() != MCFragment::FT_Data) + OS << '+' << Var.size(); + OS << " ["; + for (unsigned i = 0, e = Fixed.size(); i != e; ++i) { if (i) OS << ","; - OS << format("%02x", uint8_t(Contents[i])); + OS << format("%02x", uint8_t(Fixed[i])); + } + for (unsigned i = 0, e = Var.size(); i != e; ++i) { + if (Fixed.size() || i) + OS << ","; + OS << format("%02x", uint8_t(Var[i])); } OS << ']'; - printFixups(F->getFixups()); + switch (getKind()) { + case MCFragment::FT_Data: + break; + case MCFragment::FT_Relaxable: + OS << ' '; + getInst().dump_pretty(OS); + break; + case MCFragment::FT_LEB: { + OS << " Value:"; + getLEBValue().print(OS, nullptr); + OS << " Signed:" << isLEBSigned(); + break; + } + case MCFragment::FT_Dwarf: + OS << " AddrDelta:"; + getDwarfAddrDelta().print(OS, nullptr); + OS << " LineDelta:" << getDwarfLineDelta(); + break; + case MCFragment::FT_DwarfFrame: + OS << " AddrDelta:"; + getDwarfAddrDelta().print(OS, nullptr); + break; + default: + llvm_unreachable(""); + } + printFixups(getFixups()); + printFixups(getVarFixups()); break; } case MCFragment::FT_Fill: { @@ -111,13 +148,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { << " ControlledNopLength:" << NF->getControlledNopLength(); break; } - case MCFragment::FT_Relaxable: { - const auto *F = cast(this); - OS << " Size:" << F->getContents().size() << ' '; - F->getInst().dump_pretty(OS); - printFixups(F->getFixups()); - break; - } case MCFragment::FT_Org: { const auto *OF = cast(this); OS << " Offset:"; @@ -125,26 +155,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << " Value:" 
<< static_cast(OF->getValue()); break; } - case MCFragment::FT_Dwarf: { - const auto *OF = cast(this); - OS << " AddrDelta:"; - OF->getAddrDelta().print(OS, nullptr); - OS << " LineDelta:" << OF->getLineDelta(); - break; - } - case MCFragment::FT_DwarfFrame: { - const auto *CF = cast(this); - OS << " AddrDelta:"; - CF->getAddrDelta().print(OS, nullptr); - break; - } - case MCFragment::FT_LEB: { - const auto *LF = cast(this); - OS << " Value:"; - LF->getValue().print(OS, nullptr); - OS << " Signed:" << LF->isSigned(); - break; - } case MCFragment::FT_BoundaryAlign: { const auto *BF = cast(this); OS << " BoundarySize:" << BF->getAlignment().value() diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index a54dfb9cbaea1..43598ef038b96 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -161,7 +161,7 @@ void MCMachOStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { // We have to create a new fragment if this is an atom defining symbol, // fragments cannot span atoms. if (cast(Symbol)->isSymbolLinkerVisible()) - insert(getContext().allocFragment()); + insert(getContext().allocFragment()); MCObjectStreamer::emitLabel(Symbol, Loc); @@ -483,8 +483,7 @@ void MCMachOStreamer::finalizeCGProfile() { // For each entry, reserve space for 2 32-bit indices and a 64-bit count. size_t SectionBytes = W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t)); - cast(*CGProfileSection->begin()) - .appendContents(SectionBytes, 0); + (*CGProfileSection->begin()).appendContents(SectionBytes, 0); } MCStreamer *llvm::createMachOStreamer(MCContext &Context, @@ -513,7 +512,7 @@ void MCMachOStreamer::createAddrSigSection() { MCSection *AddrSigSection = Asm.getContext().getObjectFileInfo()->getAddrSigSection(); changeSection(AddrSigSection); - auto *Frag = cast(AddrSigSection->curFragList()->Head); + auto *Frag = cast(AddrSigSection->curFragList()->Head); // We will generate a series of pointer-sized symbol relocations at offset // 0x0. Set the section size to be large enough to contain a single pointer // (instead of emitting a zero-sized section) so these relocations are diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 44a82f75576b6..67433f2b265e5 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -46,37 +46,9 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() { return nullptr; } -// When fixup's offset is a forward declared label, e.g.: -// -// .reloc 1f, R_MIPS_JALR, foo -// 1: nop -// -// postpone adding it to Fixups vector until the label is defined and its offset -// is known. -void MCObjectStreamer::resolvePendingFixups() { - for (PendingMCFixup &PendingFixup : PendingFixups) { - if (!PendingFixup.Sym || PendingFixup.Sym->isUndefined ()) { - getContext().reportError(PendingFixup.Fixup.getLoc(), - "unresolved relocation offset"); - continue; - } - PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset() + - PendingFixup.Fixup.getOffset()); - - // If the location symbol to relocate is in MCEncodedFragment, - // put the Fixup into location symbol's fragment. Otherwise - // put into PendingFixup.DF - MCFragment *SymFragment = PendingFixup.Sym->getFragment(); - if (auto *F = dyn_cast(SymFragment)) - F->addFixup(PendingFixup.Fixup); - else - PendingFixup.DF->addFixup(PendingFixup.Fixup); - } - PendingFixups.clear(); -} - // As a compile-time optimization, avoid allocating and evaluating an MCExpr -// tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment. 
+// tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment's fixed +// part. static std::optional absoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo) { assert(Hi && Lo); @@ -85,8 +57,11 @@ static std::optional absoluteSymbolDiff(const MCSymbol *Hi, if (Hi->isVariable() || Lo->isVariable()) return std::nullopt; auto *LoF = Lo->getFragment(); - if (!LoF || LoF->getKind() != MCFragment::FT_Data || - Hi->getFragment() != LoF || LoF->isLinkerRelaxable()) + if (!LoF || Hi->getFragment() != LoF || LoF->isLinkerRelaxable()) + return std::nullopt; + // If either symbol resides in the variable part, bail out. + auto Fixed = LoF->getFixedSize(); + if (Lo->getOffset() > Fixed || Hi->getOffset() > Fixed) return std::nullopt; return Hi->getOffset() - Lo->getOffset(); @@ -131,7 +106,7 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) { MCDwarfFrameEmitter::Emit(*this, MAB, false); } -static bool canReuseDataFragment(const MCDataFragment &F, +static bool canReuseDataFragment(const MCFragment &F, const MCAssembler &Assembler, const MCSubtargetInfo *STI) { if (!F.hasInstructions()) @@ -141,20 +116,17 @@ static bool canReuseDataFragment(const MCDataFragment &F, // instruction cannot be resolved at assemble-time. if (F.isLinkerRelaxable()) return false; - // When bundling is enabled, we don't want to add data to a fragment that - // already has instructions (see MCELFStreamer::emitInstToData for details) - if (Assembler.isBundlingEnabled()) - return false; // If the subtarget is changed mid fragment we start a new fragment to record // the new STI. return !STI || F.getSubtargetInfo() == STI; } -MCDataFragment * +MCFragment * MCObjectStreamer::getOrCreateDataFragment(const MCSubtargetInfo *STI) { - auto *F = dyn_cast(getCurrentFragment()); - if (!F || !canReuseDataFragment(*F, *Assembler, STI)) { - F = getContext().allocFragment(); + auto *F = getCurrentFragment(); + if (F->getKind() != MCFragment::FT_Data || + !canReuseDataFragment(*F, *Assembler, STI)) { + F = getContext().allocFragment(); insert(F); } return F; @@ -173,7 +145,7 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug) { void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) { MCStreamer::emitValueImpl(Value, Size, Loc); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); MCDwarfLineEntry::make(this, getCurrentSectionOnly()); @@ -222,7 +194,7 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { // If there is a current fragment, mark the symbol as pointing into it. // Otherwise queue the label and set its fragment pointer when we emit the // next fragment. - MCDataFragment *F = getOrCreateDataFragment(); + MCFragment *F = getOrCreateDataFragment(); Symbol->setFragment(F); Symbol->setOffset(F->getContents().size()); @@ -242,7 +214,7 @@ void MCObjectStreamer::emitPendingAssignments(MCSymbol *Symbol) { // Emit a label at a previously emitted fragment/offset position. This must be // within the currently-active section. 
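The reworked absoluteSymbolDiff() above now folds Hi - Lo only when both offsets land in the fragment's fixed part, since offsets into the variable tail are not final until relaxation. A simplified model of that check (stand-in types, not the LLVM classes):

#include <cassert>
#include <cstdint>
#include <optional>

struct Fragment { uint64_t FixedSize; };
struct Symbol { const Fragment *F; uint64_t Offset; };

// Fold Hi - Lo without building an expression tree when both symbols sit
// in the fixed part of the same fragment.
static std::optional<uint64_t> absDiff(const Symbol &Hi, const Symbol &Lo) {
  if (!Lo.F || Hi.F != Lo.F)
    return std::nullopt; // Different fragments: layout-dependent.
  // Offsets past the fixed part point into the relaxable tail; bail out.
  if (Lo.Offset > Lo.F->FixedSize || Hi.Offset > Hi.F->FixedSize)
    return std::nullopt;
  return Hi.Offset - Lo.Offset;
}

int main() {
  Fragment F{/*FixedSize=*/8};
  assert(absDiff({&F, 6}, {&F, 2}).value() == 4);
  assert(!absDiff({&F, 10}, {&F, 2}).has_value()); // Hi is in the tail.
  return 0;
}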
void MCObjectStreamer::emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc, - MCDataFragment &F, uint64_t Offset) { + MCFragment &F, uint64_t Offset) { assert(F.getParent() == getCurrentSectionOnly()); MCStreamer::emitLabel(Symbol, Loc); getAssembler().registerSymbol(*Symbol); @@ -256,7 +228,10 @@ void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) { emitULEB128IntValue(IntValue); return; } - insert(getContext().allocFragment(*Value, false)); + auto *F = getOrCreateDataFragment(); + F->Kind = MCFragment::FT_LEB; + F->setLEBSigned(false); + F->setLEBValue(Value); } void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) { @@ -265,7 +240,10 @@ void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) { emitSLEB128IntValue(IntValue); return; } - insert(getContext().allocFragment(*Value, true)); + auto *F = getOrCreateDataFragment(); + F->Kind = MCFragment::FT_LEB; + F->setLEBSigned(true); + F->setLEBValue(Value); } void MCObjectStreamer::emitWeakReference(MCSymbol *Alias, @@ -289,7 +267,7 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section, // If the subsection number is not in the sorted Subsections list, create a // new fragment list. if (I == E || Subsections[I].first != Subsection) { - auto *F = getContext().allocFragment(); + auto *F = getContext().allocFragment(); F->setParent(Section); Subsections.insert(Subsections.begin() + I, {Subsection, MCSection::FragList{F, F}}); @@ -359,13 +337,8 @@ void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst, return; } - // Otherwise, relax and emit it as data if either: - // - The RelaxAll flag was passed - // - Bundling is enabled and this instruction is inside a bundle-locked - // group. We want to emit all such instructions into the same data - // fragment. - if (Assembler.getRelaxAll() || - (Assembler.isBundlingEnabled() && Sec->isBundleLocked())) { + // Otherwise, relax and emit it as data if RelaxAll is specified. + if (Assembler.getRelaxAll()) { MCInst Relaxed = Inst; while (Backend.mayNeedRelaxation(Relaxed.getOpcode(), Relaxed.getOperands(), STI)) @@ -374,57 +347,47 @@ void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst, return; } - // Otherwise emit to a separate fragment. emitInstToFragment(Inst, STI); } void MCObjectStreamer::emitInstToData(const MCInst &Inst, const MCSubtargetInfo &STI) { - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *F = getOrCreateDataFragment(); + + // Append the instruction to the data fragment. + size_t FixupStartIndex = F->getFixups().size(); + size_t CodeOffset = F->getContents().size(); SmallVector Fixups; - SmallString<256> Code; - getAssembler().getEmitter().encodeInstruction(Inst, Code, Fixups, STI); + getAssembler().getEmitter().encodeInstruction( + Inst, F->getContentsForAppending(), Fixups, STI); + F->doneAppending(); + if (!Fixups.empty()) + F->appendFixups(Fixups); - auto CodeOffset = DF->getContents().size(); - for (MCFixup &Fixup : Fixups) + for (auto &Fixup : MutableArrayRef(F->getFixups()).slice(FixupStartIndex)) { Fixup.setOffset(Fixup.getOffset() + CodeOffset); - if (!Fixups.empty()) - DF->appendFixups(Fixups); - DF->setHasInstructions(STI); - DF->appendContents(Code); + if (Fixup.isLinkerRelaxable()) { + F->setLinkerRelaxable(); + getCurrentSectionOnly()->setLinkerRelaxable(); + } + } + + F->setHasInstructions(STI); } void MCObjectStreamer::emitInstToFragment(const MCInst &Inst, const MCSubtargetInfo &STI) { - // Always create a new, separate fragment here, because its size can change - // during relaxation. 
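The emitInstToData() rewrite above appends into a shared fragment, so the encoder-reported fixup offsets, which are relative to the instruction just encoded, must be rebased to fragment-relative offsets. A minimal sketch of that step (illustrative types):

#include <cassert>
#include <cstdint>
#include <vector>

struct Fixup { uint32_t Offset; };

// Append encoded instruction bytes to a fragment and shift each new
// fixup by the code size that was already in the fragment.
static void appendInst(std::vector<uint8_t> &Contents,
                       std::vector<Fixup> &Fixups,
                       const std::vector<uint8_t> &Code,
                       std::vector<Fixup> NewFixups) {
  size_t CodeOffset = Contents.size(); // Bytes already in the fragment.
  Contents.insert(Contents.end(), Code.begin(), Code.end());
  for (Fixup &F : NewFixups)
    F.Offset += static_cast<uint32_t>(CodeOffset); // inst- -> fragment-relative
  Fixups.insert(Fixups.end(), NewFixups.begin(), NewFixups.end());
}

int main() {
  std::vector<uint8_t> Contents(4, 0x90); // 4 bytes already emitted.
  std::vector<Fixup> Fixups;
  appendInst(Contents, Fixups, {0xe8, 0, 0, 0, 0}, {{1}}); // call rel32
  assert(Fixups[0].Offset == 5); // 4 existing bytes + 1 into the call.
  return 0;
}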
- MCRelaxableFragment *IF = - getContext().allocFragment(STI); - insert(IF); - IF->setInst(Inst); - + auto *F = getOrCreateDataFragment(); + SmallVector Data; SmallVector Fixups; - getAssembler().getEmitter().encodeInstruction( - Inst, IF->getContentsForAppending(), Fixups, STI); - IF->doneAppending(); - IF->appendFixups(Fixups); -} - -#ifndef NDEBUG -static const char *const BundlingNotImplementedMsg = - "Aligned bundling is not implemented for this object format"; -#endif + getAssembler().getEmitter().encodeInstruction(Inst, Data, Fixups, STI); -void MCObjectStreamer::emitBundleAlignMode(Align Alignment) { - llvm_unreachable(BundlingNotImplementedMsg); -} - -void MCObjectStreamer::emitBundleLock(bool AlignToEnd) { - llvm_unreachable(BundlingNotImplementedMsg); -} - -void MCObjectStreamer::emitBundleUnlock() { - llvm_unreachable(BundlingNotImplementedMsg); + F->Kind = MCFragment::FT_Relaxable; + F->STI = &STI; + F->HasInstructions = true; + F->setVarContents(Data); + F->setVarFixups(Fixups); + F->setInst(Inst); } void MCObjectStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line, @@ -486,9 +449,10 @@ void MCObjectStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta, return; } - const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel, SMLoc()); - insert(getContext().allocFragment(LineDelta, - *AddrDelta)); + auto *F = getOrCreateDataFragment(); + F->Kind = MCFragment::FT_Dwarf; + F->setDwarfAddrDelta(buildSymbolDiff(*this, Label, LastLabel, SMLoc())); + F->setDwarfLineDelta(LineDelta); } void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section, @@ -516,8 +480,9 @@ void MCObjectStreamer::emitDwarfLineEndEntry(MCSection *Section, void MCObjectStreamer::emitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, const MCSymbol *Label, SMLoc Loc) { - const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel, Loc); - insert(getContext().allocFragment(*AddrDelta)); + auto *F = getOrCreateDataFragment(); + F->Kind = MCFragment::FT_DwarfFrame; + F->setDwarfAddrDelta(buildSymbolDiff(*this, Label, LastLabel, Loc)); } void MCObjectStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo, @@ -576,7 +541,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) { void MCObjectStreamer::emitBytes(StringRef Data) { MCDwarfLineEntry::make(this, getCurrentSectionOnly()); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); DF->appendContents(ArrayRef(Data.data(), Data.size())); } @@ -613,76 +578,14 @@ void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, insert(getContext().allocFragment(*Offset, Value, Loc)); } -static std::optional> -getOffsetAndDataFragment(const MCSymbol &Symbol, uint32_t &RelocOffset, - MCDataFragment *&DF) { - if (Symbol.isVariable()) { - const MCExpr *SymbolExpr = Symbol.getVariableValue(); - MCValue OffsetVal; - if (!SymbolExpr->evaluateAsRelocatable(OffsetVal, nullptr)) - return std::make_pair(false, - std::string("symbol in .reloc offset is not " - "relocatable")); - if (OffsetVal.isAbsolute()) { - RelocOffset = OffsetVal.getConstant(); - MCFragment *Fragment = Symbol.getFragment(); - // FIXME Support symbols with no DF. 
For example: - // .reloc .data, ENUM_VALUE, - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - DF = cast(Fragment); - return std::nullopt; - } - - if (OffsetVal.getSubSym()) - return std::make_pair(false, - std::string(".reloc symbol offset is not " - "representable")); - - const MCSymbol &SA = *OffsetVal.getAddSym(); - if (!SA.isDefined()) - return std::make_pair(false, - std::string("symbol used in the .reloc offset is " - "not defined")); - - if (SA.isVariable()) - return std::make_pair(false, - std::string("symbol used in the .reloc offset is " - "variable")); - - MCFragment *Fragment = SA.getFragment(); - // FIXME Support symbols with no DF. For example: - // .reloc .data, ENUM_VALUE, - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - RelocOffset = SA.getOffset() + OffsetVal.getConstant(); - DF = cast(Fragment); - } else { - RelocOffset = Symbol.getOffset(); - MCFragment *Fragment = Symbol.getFragment(); - // FIXME Support symbols with no DF. For example: - // .reloc .data, ENUM_VALUE, - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - DF = cast(Fragment); - } - return std::nullopt; -} - -std::optional> -MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, - const MCExpr *Expr, SMLoc Loc, - const MCSubtargetInfo &STI) { +void MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) { std::optional MaybeKind = Assembler->getBackend().getFixupKind(Name); - if (!MaybeKind) - return std::make_pair(true, std::string("unknown relocation name")); + if (!MaybeKind) { + getContext().reportError(Loc, "unknown relocation name"); + return; + } MCFixupKind Kind = *MaybeKind; if (Expr) @@ -691,38 +594,14 @@ MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, Expr = MCSymbolRefExpr::create(getContext().createTempSymbol(), getContext()); - MCDataFragment *DF = getOrCreateDataFragment(&STI); - MCValue OffsetVal; - if (!Offset.evaluateAsRelocatable(OffsetVal, nullptr)) - return std::make_pair(false, - std::string(".reloc offset is not relocatable")); - if (OffsetVal.isAbsolute()) { - if (OffsetVal.getConstant() < 0) - return std::make_pair(false, std::string(".reloc offset is negative")); - DF->addFixup(MCFixup::create(OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; + auto *O = &Offset; + int64_t Val; + if (Offset.evaluateAsAbsolute(Val, nullptr)) { + auto *SecSym = getCurrentSectionOnly()->getBeginSymbol(); + O = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(SecSym, getContext()), + O, getContext(), Loc); } - if (OffsetVal.getSubSym()) - return std::make_pair(false, - std::string(".reloc offset is not representable")); - - const MCSymbol &Symbol = *OffsetVal.getAddSym(); - if (Symbol.isDefined()) { - uint32_t SymbolOffset = 0; - std::optional> Error = - getOffsetAndDataFragment(Symbol, SymbolOffset, DF); - - if (Error != std::nullopt) - return Error; - - DF->addFixup( - MCFixup::create(SymbolOffset + OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; - } - - PendingFixups.emplace_back( - &Symbol, DF, MCFixup::create(OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; + getAssembler().addRelocDirective({*O, Expr, Kind}); } void 
MCObjectStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue, @@ -805,6 +684,5 @@ void MCObjectStreamer::finishImpl() { // Emit pseudo probes for the current module. MCPseudoProbeTable::emit(this); - resolvePendingFixups(); getAssembler().Finish(); } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 9fd6c05a846db..77bf84364c5a3 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -421,9 +421,6 @@ class AsmParser : public MCAsmParser { DK_ORG, DK_FILL, DK_ENDR, - DK_BUNDLE_ALIGN_MODE, - DK_BUNDLE_LOCK, - DK_BUNDLE_UNLOCK, DK_ZERO, DK_EXTERN, DK_GLOBL, @@ -624,12 +621,6 @@ class AsmParser : public MCAsmParser { bool parseDirectiveMacrosOnOff(StringRef Directive); // alternate macro mode directives bool parseDirectiveAltmacro(StringRef Directive); - // ".bundle_align_mode" - bool parseDirectiveBundleAlignMode(); - // ".bundle_lock" - bool parseDirectiveBundleLock(); - // ".bundle_unlock" - bool parseDirectiveBundleUnlock(); // ".space", ".skip" bool parseDirectiveSpace(StringRef IDVal); @@ -2062,12 +2053,6 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, return parseDirectiveIrpc(IDLoc); case DK_ENDR: return parseDirectiveEndr(IDLoc); - case DK_BUNDLE_ALIGN_MODE: - return parseDirectiveBundleAlignMode(); - case DK_BUNDLE_LOCK: - return parseDirectiveBundleLock(); - case DK_BUNDLE_UNLOCK: - return parseDirectiveBundleUnlock(); case DK_SLEB128: return parseDirectiveLEB128(true); case DK_ULEB128: @@ -3094,7 +3079,6 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { const MCExpr *Offset; const MCExpr *Expr = nullptr; - SMLoc OffsetLoc = Lexer.getTok().getLoc(); if (parseExpression(Offset)) return true; @@ -3120,13 +3104,7 @@ bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { if (parseEOL()) return true; - const MCTargetAsmParser &MCT = getTargetParser(); - const MCSubtargetInfo &STI = MCT.getSTI(); - if (std::optional> Err = - getStreamer().emitRelocDirective(*Offset, Name, Expr, DirectiveLoc, - STI)) - return Error(Err->first ? NameLoc : OffsetLoc, Err->second); - + getStreamer().emitRelocDirective(*Offset, Name, Expr, NameLoc); return false; } @@ -4779,56 +4757,6 @@ bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) { return false; } -/// parseDirectiveBundleAlignMode -/// ::= {.bundle_align_mode} expression -bool AsmParser::parseDirectiveBundleAlignMode() { - // Expect a single argument: an expression that evaluates to a constant - // in the inclusive range 0-30. 
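The new emitRelocDirective() above defers every .reloc directive to layout(), first rebasing an absolute offset onto the current section's begin symbol so each recorded offset is symbol-relative; forward-declared labels then need no separate pending-fixup list. A simplified model of the deferral (illustrative types, not the MC API):

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

struct RelocDirective {
  std::string OffsetSym; // symbol the offset is relative to
  uint64_t Addend = 0;   // constant part of the offset
  std::string Kind;      // relocation name, e.g. "BFD_RELOC_NONE"
};

struct Streamer {
  std::string SectionBegin = ".text.begin"; // current section's begin symbol
  std::vector<RelocDirective> Deferred;

  // Nothing is resolved at parse time; layout() walks Deferred later and
  // turns each entry into a fixup once all offsets are known.
  void emitRelocDirective(bool OffsetIsAbsolute, std::string Sym,
                          uint64_t Addend, std::string Kind) {
    if (OffsetIsAbsolute)
      Sym = SectionBegin; // ".reloc 4, ..." becomes section_begin + 4
    Deferred.push_back({Sym, Addend, Kind});
  }
};

int main() {
  Streamer S;
  S.emitRelocDirective(/*OffsetIsAbsolute=*/true, "", 4, "BFD_RELOC_NONE");
  assert(S.Deferred[0].OffsetSym == ".text.begin" && S.Deferred[0].Addend == 4);
  return 0;
}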
- SMLoc ExprLoc = getLexer().getLoc(); - int64_t AlignSizePow2; - if (checkForValidSection() || parseAbsoluteExpression(AlignSizePow2) || - parseEOL() || - check(AlignSizePow2 < 0 || AlignSizePow2 > 30, ExprLoc, - "invalid bundle alignment size (expected between 0 and 30)")) - return true; - - getStreamer().emitBundleAlignMode(Align(1ULL << AlignSizePow2)); - return false; -} - -/// parseDirectiveBundleLock -/// ::= {.bundle_lock} [align_to_end] -bool AsmParser::parseDirectiveBundleLock() { - if (checkForValidSection()) - return true; - bool AlignToEnd = false; - - StringRef Option; - SMLoc Loc = getTok().getLoc(); - const char *kInvalidOptionError = - "invalid option for '.bundle_lock' directive"; - - if (!parseOptionalToken(AsmToken::EndOfStatement)) { - if (check(parseIdentifier(Option), Loc, kInvalidOptionError) || - check(Option != "align_to_end", Loc, kInvalidOptionError) || parseEOL()) - return true; - AlignToEnd = true; - } - - getStreamer().emitBundleLock(AlignToEnd); - return false; -} - -/// parseDirectiveBundleLock -/// ::= {.bundle_lock} -bool AsmParser::parseDirectiveBundleUnlock() { - if (checkForValidSection() || parseEOL()) - return true; - - getStreamer().emitBundleUnlock(); - return false; -} - /// parseDirectiveSpace /// ::= (.skip | .space) expression [ , expression ] bool AsmParser::parseDirectiveSpace(StringRef IDVal) { @@ -5477,9 +5405,6 @@ void AsmParser::initializeDirectiveKindMap() { DirectiveKindMap[".irp"] = DK_IRP; DirectiveKindMap[".irpc"] = DK_IRPC; DirectiveKindMap[".endr"] = DK_ENDR; - DirectiveKindMap[".bundle_align_mode"] = DK_BUNDLE_ALIGN_MODE; - DirectiveKindMap[".bundle_lock"] = DK_BUNDLE_LOCK; - DirectiveKindMap[".bundle_unlock"] = DK_BUNDLE_UNLOCK; DirectiveKindMap[".if"] = DK_IF; DirectiveKindMap[".ifeq"] = DK_IFEQ; DirectiveKindMap[".ifge"] = DK_IFGE; diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 56450033529bc..93671450c0c2f 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -11,7 +11,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -21,9 +20,8 @@ using namespace llvm; MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsVirtual, MCSymbol *Begin) - : Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false), - IsRegistered(false), IsText(IsText), IsVirtual(IsVirtual), - LinkerRelaxable(false), Name(Name), Variant(V) { + : Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText), + IsVirtual(IsVirtual), LinkerRelaxable(false), Name(Name), Variant(V) { // The initial subsection number is 0. Create a fragment list. CurFragList = &Subsections.emplace_back(0u, FragList{}).second; } @@ -36,25 +34,6 @@ MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) { bool MCSection::hasEnded() const { return End && End->isInSection(); } -void MCSection::setBundleLockState(BundleLockStateType NewState) { - if (NewState == NotBundleLocked) { - if (BundleLockNestingDepth == 0) { - report_fatal_error("Mismatched bundle_lock/unlock directives"); - } - if (--BundleLockNestingDepth == 0) { - BundleLockState = NotBundleLocked; - } - return; - } - - // If any of the directives is an align_to_end directive, the whole nested - // group is align_to_end. So don't downgrade from align_to_end to just locked. 
- if (BundleLockState != BundleLockedAlignToEnd) { - BundleLockState = NewState; - } - ++BundleLockNestingDepth; -} - StringRef MCSection::getVirtualSectionKind() const { return "virtual"; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -81,7 +60,7 @@ LLVM_DUMP_METHOD void MCSection::dump( } #endif -void MCEncodedFragment::setContents(ArrayRef Contents) { +void MCFragment::setContents(ArrayRef Contents) { auto &S = getParent()->ContentStorage; if (ContentStart + Contents.size() > ContentEnd) { ContentStart = S.size(); @@ -91,9 +70,19 @@ void MCEncodedFragment::setContents(ArrayRef Contents) { llvm::copy(Contents, S.begin() + ContentStart); } -void MCEncodedFragment::addFixup(MCFixup Fixup) { appendFixups({Fixup}); } +void MCFragment::setVarContents(ArrayRef Contents) { + auto &S = getParent()->ContentStorage; + if (VarContentStart + Contents.size() > VarContentEnd) { + VarContentStart = S.size(); + S.resize_for_overwrite(S.size() + Contents.size()); + } + VarContentEnd = VarContentStart + Contents.size(); + llvm::copy(Contents, S.begin() + VarContentStart); +} -void MCEncodedFragment::appendFixups(ArrayRef Fixups) { +void MCFragment::addFixup(MCFixup Fixup) { appendFixups({Fixup}); } + +void MCFragment::appendFixups(ArrayRef Fixups) { auto &S = getParent()->FixupStorage; if (LLVM_UNLIKELY(FixupEnd != S.size())) { // Move the elements to the end. Reserve space to avoid invalidating @@ -107,7 +96,7 @@ void MCEncodedFragment::appendFixups(ArrayRef Fixups) { FixupEnd = S.size(); } -void MCEncodedFragment::setFixups(ArrayRef Fixups) { +void MCFragment::setFixups(ArrayRef Fixups) { auto &S = getParent()->FixupStorage; if (FixupStart + Fixups.size() > FixupEnd) { FixupStart = S.size(); @@ -116,3 +105,19 @@ void MCEncodedFragment::setFixups(ArrayRef Fixups) { FixupEnd = FixupStart + Fixups.size(); llvm::copy(Fixups, S.begin() + FixupStart); } + +void MCFragment::setVarFixups(ArrayRef Fixups) { + auto &S = getParent()->FixupStorage; + if (VarFixupStart + Fixups.size() > VarFixupEnd) { + VarFixupStart = S.size(); + S.resize_for_overwrite(S.size() + Fixups.size()); + } + VarFixupEnd = VarFixupStart + Fixups.size(); + // Source fixup offsets are relative to the variable part's start. Add the + // fixed part size to make them relative to the fixed part's start. + std::transform(Fixups.begin(), Fixups.end(), S.begin() + VarFixupStart, + [Fixed = getFixedSize()](MCFixup F) { + F.setOffset(Fixed + F.getOffset()); + return F; + }); +} diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 6d2fb41760044..d814ab8880500 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1333,10 +1333,7 @@ void MCStreamer::emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit) {} void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) {} -void MCStreamer::emitBundleAlignMode(Align Alignment) {} -void MCStreamer::emitBundleLock(bool AlignToEnd) {} void MCStreamer::finishImpl() {} -void MCStreamer::emitBundleUnlock() {} bool MCStreamer::popSection() { if (SectionStack.size() <= 1) diff --git a/llvm/lib/MC/MCSymbol.cpp b/llvm/lib/MC/MCSymbol.cpp index baf1ff89c0188..8192896eeb6bb 100644 --- a/llvm/lib/MC/MCSymbol.cpp +++ b/llvm/lib/MC/MCSymbol.cpp @@ -21,7 +21,7 @@ using namespace llvm; // Only the address of this fragment is ever actually used. -static MCDataFragment SentinelFragment; +static MCFragment SentinelFragment; // Sentinel value for the absolute pseudo fragment. 
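The setVarFixups() hunk above encodes the new fragment layout: a fixed part followed by a variable (relaxable) tail, with tail fixups arriving tail-relative but stored relative to the fragment start. A compact model of that offset convention (stand-in types):

#include <cassert>
#include <cstdint>
#include <vector>

struct Fixup { uint32_t Offset; };

struct Fragment {
  std::vector<uint8_t> Fixed, Var;
  std::vector<Fixup> VarFixups;

  // Rebase tail-relative fixup offsets by the fixed-part size so all
  // stored offsets share one origin: the start of the fragment.
  void setVarFixups(std::vector<Fixup> Fixups) {
    for (Fixup &F : Fixups)
      F.Offset += static_cast<uint32_t>(Fixed.size());
    VarFixups = std::move(Fixups);
  }
};

int main() {
  Fragment F;
  F.Fixed.assign(6, 0);  // 6 fixed bytes
  F.Var.assign(4, 0);    // 4-byte relaxable tail
  F.setVarFixups({{0}}); // fixup at the start of the tail
  assert(F.VarFixups[0].Offset == 6);
  return 0;
}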
MCFragment *MCSymbol::AbsolutePseudoFragment = &SentinelFragment; diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp index 602d40df4cc3a..5891420c0727d 100644 --- a/llvm/lib/MC/MCWasmStreamer.cpp +++ b/llvm/lib/MC/MCWasmStreamer.cpp @@ -11,8 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCWasmStreamer.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" @@ -47,7 +45,7 @@ void MCWasmStreamer::emitLabel(MCSymbol *S, SMLoc Loc) { Symbol->setTLS(); } -void MCWasmStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCDataFragment &F, +void MCWasmStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment &F, uint64_t Offset) { auto *Symbol = cast(S); MCObjectStreamer::emitLabelAtPos(Symbol, Loc, F, Offset); diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index 8cc74db9beea4..e8b26bf291ee4 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -318,7 +318,7 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) { // Emit the epilog instructions. if (EnableUnwindV2) { - MCDataFragment *DF = OS->getOrCreateDataFragment(); + MCFragment *DF = OS->getOrCreateDataFragment(); bool IsLast = true; for (const auto &Epilog : llvm::reverse(info->EpilogMap)) { diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index 0c0866f9e36a4..3398775df3f91 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -278,7 +278,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) { void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) { visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, FK_SecRel_2); DF->addFixup(Fixup); @@ -288,7 +288,7 @@ void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) { void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol, uint64_t Offset) { visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); // Create Symbol A for the relocation relative reference. const MCExpr *MCE = MCSymbolRefExpr::create(Symbol, getContext()); // Add the constant offset, if given. @@ -306,7 +306,7 @@ void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol, void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, int64_t Offset) { visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); // Create Symbol A for the relocation relative reference. const MCExpr *MCE = MCSymbolRefExpr::create( Symbol, MCSymbolRefExpr::VK_COFF_IMGREL32, getContext()); @@ -324,7 +324,7 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol, void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); // Create Symbol for section number. 
const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create( *Symbol, this->getWriter(), getContext()); @@ -338,7 +338,7 @@ void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) { void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) { visitUsedSymbol(*Symbol); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); // Create Symbol for section offset. const MCExpr *MCE = MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext()); diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index 12e6680bcf074..4d4529653aba9 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -89,7 +89,7 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility( void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) { // Add a Fixup here to later record a relocation of type R_REF to prevent the // ref symbol from being garbage collected (by the binder). - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext()); std::optional MaybeKind = getAssembler().getBackend().getFixupKind("R_REF"); diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index fb8e1fdf62f50..3291dd774c1e0 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -799,7 +799,7 @@ uint64_t MachObjectWriter::writeObject() { if (!CGProfile.empty()) { MCSection *CGProfileSection = getContext().getMachOSection( "__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); - auto &Frag = cast(*CGProfileSection->begin()); + auto &Frag = *CGProfileSection->begin(); Frag.clearContents(); raw_svector_ostream OS(Frag.getContentsForAppending()); for (const MCObjectWriter::CGProfileEntry &CGPE : CGProfile) { diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 4c226d4420e1d..7af240a73f952 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -711,10 +711,12 @@ static void addData(SmallVectorImpl &DataBytes, llvm_unreachable("The fill should be an assembler constant"); DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues, Fill->getValue()); - } else if (auto *LEB = dyn_cast(&Frag)) { - llvm::append_range(DataBytes, LEB->getContents()); } else { - llvm::append_range(DataBytes, cast(Frag).getContents()); + llvm::append_range(DataBytes, Frag.getContents()); + if (Frag.getKind() == MCFragment::FT_LEB) + llvm::append_range(DataBytes, Frag.getVarContents()); + else + assert(Frag.getKind() == MCFragment::FT_Data); } } @@ -1884,7 +1886,7 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, if (WS.getName().substr(PrefixLength + 1).getAsInteger(10, Priority)) report_fatal_error("invalid .init_array section priority"); } - const auto &DataFrag = cast(Frag); + const auto &DataFrag = Frag; assert(llvm::all_of(DataFrag.getContents(), [](char C) { return !C; })); for (const MCFixup &Fixup : DataFrag.getFixups()) { assert(Fixup.getKind() == diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index c2ef430984ed4..ee4d957fe9d87 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -1069,7 +1069,7 @@ uint64_t WinCOFFWriter::writeObject() { if (Mode != DwoOnly && OWriter.getEmitAddrsigSection()) { auto *Sec = getContext().getCOFFSection(".llvm_addrsig", COFF::IMAGE_SCN_LNK_REMOVE); - auto *Frag = 
cast<MCDataFragment>(Sec->curFragList()->Head); + auto *Frag = Sec->curFragList()->Head; raw_svector_ostream OS(Frag->getContentsForAppending()); for (const MCSymbol *S : OWriter.AddrsigSyms) { if (!S->isRegistered()) @@ -1092,7 +1092,7 @@ uint64_t WinCOFFWriter::writeObject() { if (Mode != DwoOnly && !OWriter.getCGProfile().empty()) { auto *Sec = getContext().getCOFFSection(".llvm.call-graph-profile", COFF::IMAGE_SCN_LNK_REMOVE); - auto *Frag = cast<MCDataFragment>(Sec->curFragList()->Head); + auto *Frag = Sec->curFragList()->Head; raw_svector_ostream OS(Frag->getContentsForAppending()); for (const auto &CGPE : OWriter.getCGProfile()) { uint32_t FromIndex = CGPE.From->getSymbol().getIndex(); diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index fdc27888c6c52..0b46ff71240b7 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -9,7 +9,6 @@ #include "llvm/Object/DXContainer.h" #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" -#include "llvm/Support/Alignment.h" #include "llvm/Support/Endian.h" #include "llvm/Support/FormatVariadic.h" diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index ff99dc25abe07..af073f6a1a917 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -321,6 +321,7 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) { STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_OFFLOADING); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_LTO); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_JT_SIZES) + STRINGIFY_ENUM_CASE(ELF, SHT_GNU_SFRAME); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_verdef); diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 0e13d32bbe522..5597d7db6426d 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -810,6 +810,10 @@ ELFObjectFileBase::getPltEntries(const MCSubtargetInfo &STI) const { JumpSlotReloc = ELF::R_HEX_JMP_SLOT; GlobDatReloc = ELF::R_HEX_GLOB_DAT; break; + case Triple::riscv32: + case Triple::riscv64: + JumpSlotReloc = ELF::R_RISCV_JUMP_SLOT; + break; default: return {}; } diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp index b6318bbe3ab74..d81899334b2b1 100644 --- a/llvm/lib/Object/RelocationResolver.cpp +++ b/llvm/lib/Object/RelocationResolver.cpp @@ -812,6 +812,7 @@ getRelocationResolver(const ObjectFile &Obj) { case Triple::amdgcn: return {supportsAmdgpu, resolveAmdgpu}; case Triple::riscv64: + case Triple::riscv64be: return {supportsRISCV, resolveRISCV}; default: if (isAMDGPU(Obj)) @@ -851,6 +852,7 @@ getRelocationResolver(const ObjectFile &Obj) { case Triple::r600: return {supportsAmdgpu, resolveAmdgpu}; case Triple::riscv32: + case Triple::riscv32be: return {supportsRISCV, resolveRISCV}; case Triple::csky: return {supportsCSKY, resolveCSKY}; @@ -897,7 +899,9 @@ uint64_t resolveRelocation(RelocationResolver Resolver, const RelocationRef &R, if (Obj->getArch() != Triple::loongarch32 && Obj->getArch() != Triple::loongarch64 && Obj->getArch() != Triple::riscv32 && - Obj->getArch() != Triple::riscv64) + Obj->getArch() != Triple::riscv64 && + Obj->getArch() != Triple::riscv32be && + Obj->getArch() != Triple::riscv64be) LocData = 0; } } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 970811f40f918..7fcabb684e87f 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -70,6 +70,7 @@ void
ScalarEnumerationTraits<ELFYAML::ELF_PT>::enumeration( ECase(PT_GNU_STACK); ECase(PT_GNU_RELRO); ECase(PT_GNU_PROPERTY); + ECase(PT_GNU_SFRAME); #undef ECase IO.enumFallback<Hex32>(Value); } @@ -723,6 +724,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration( ECase(SHT_LLVM_BB_ADDR_MAP); ECase(SHT_LLVM_OFFLOADING); ECase(SHT_LLVM_LTO); + ECase(SHT_GNU_SFRAME); ECase(SHT_GNU_ATTRIBUTES); ECase(SHT_GNU_HASH); ECase(SHT_GNU_verdef); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 572e5f19a1972..80fb52f9603e8 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -94,6 +94,7 @@ #include "llvm/CodeGen/ExpandLargeDivRem.h" #include "llvm/CodeGen/ExpandMemCmp.h" #include "llvm/CodeGen/ExpandPostRAPseudos.h" +#include "llvm/CodeGen/ExpandReductions.h" #include "llvm/CodeGen/FEntryInserter.h" #include "llvm/CodeGen/FinalizeISel.h" #include "llvm/CodeGen/FixupStatepointCallerSaved.h" @@ -142,6 +143,7 @@ #include "llvm/CodeGen/PostRAMachineSink.h" #include "llvm/CodeGen/PostRASchedulerList.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" +#include "llvm/CodeGen/ProcessImplicitDefs.h" #include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/RegAllocGreedyPass.h" @@ -153,6 +155,7 @@ #include "llvm/CodeGen/RemoveLoadsIntoFakeUses.h" #include "llvm/CodeGen/RemoveRedundantDebugValues.h" #include "llvm/CodeGen/RenameIndependentSubregs.h" +#include "llvm/CodeGen/ReplaceWithVeclib.h" #include "llvm/CodeGen/SafeStack.h" #include "llvm/CodeGen/SanitizerBinaryMetadata.h" #include "llvm/CodeGen/SelectOptimize.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 96250772da4a0..caa78b613b901 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -428,11 +428,13 @@ FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM)) FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM)) FUNCTION_PASS("expand-fp", ExpandFpPass(TM)) FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM)) +FUNCTION_PASS("expand-reductions", ExpandReductionsPass()) FUNCTION_PASS("extra-vector-passes", ExtraFunctionPassManager<ShouldRunExtraVectorPasses>()) FUNCTION_PASS("fix-irreducible", FixIrreduciblePass()) FUNCTION_PASS("flatten-cfg", FlattenCFGPass()) FUNCTION_PASS("float2int", Float2IntPass()) +FUNCTION_PASS("free-machine-function", FreeMachineFunctionPass()) FUNCTION_PASS("gc-lowering", GCLoweringPass()) FUNCTION_PASS("guard-widening", GuardWideningPass()) FUNCTION_PASS("gvn-hoist", GVNHoistPass()) @@ -520,6 +522,7 @@ FUNCTION_PASS("print<stack-safety-local>", StackSafetyPrinterPass(errs())) FUNCTION_PASS("print<uniformity>", UniformityInfoPrinterPass(errs())) FUNCTION_PASS("reassociate", ReassociatePass()) FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass()) +FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib()) FUNCTION_PASS("reg2mem", RegToMemPass()) FUNCTION_PASS("safe-stack", SafeStackPass(TM)) FUNCTION_PASS("sandbox-vectorizer", SandboxVectorizerPass()) @@ -541,6 +544,7 @@ FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) FUNCTION_PASS("tsan", ThreadSanitizerPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) +FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify<cycles>", CycleInfoVerifierPass()) diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp
b/llvm/lib/Passes/StandardInstrumentations.cpp index 0623e66772047..f165e85baf611 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -1078,9 +1078,13 @@ void OptPassGateInstrumentation::registerCallbacks( if (!PassGate.isEnabled()) return; - PIC.registerShouldRunOptionalPassCallback([this](StringRef PassName, Any IR) { - return this->shouldRun(PassName, IR); - }); + PIC.registerShouldRunOptionalPassCallback( + [this, &PIC](StringRef ClassName, Any IR) { + StringRef PassName = PIC.getPassNameForClassName(ClassName); + if (PassName.empty()) + return this->shouldRun(ClassName, IR); + return this->shouldRun(PassName, IR); + }); } raw_ostream &PrintPassInstrumentation::print() { diff --git a/llvm/lib/Support/BLAKE3/blake3_dispatch.c b/llvm/lib/Support/BLAKE3/blake3_dispatch.c index d00580fe35195..19918aa708b2f 100644 --- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c +++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c @@ -236,7 +236,7 @@ void blake3_xof_many(const uint32_t cv[8], #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); -#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512) +#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); return; diff --git a/llvm/lib/Support/BLAKE3/blake3_impl.h b/llvm/lib/Support/BLAKE3/blake3_impl.h index deed079e468a5..dd71e729f208f 100644 --- a/llvm/lib/Support/BLAKE3/blake3_impl.h +++ b/llvm/lib/Support/BLAKE3/blake3_impl.h @@ -324,7 +324,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); -#if !defined(_WIN32) +#if !defined(_WIN32) && !defined(__CYGWIN__) LLVM_LIBRARY_VISIBILITY void blake3_xof_many_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], diff --git a/llvm/lib/Support/BLAKE3/blake3_neon.c b/llvm/lib/Support/BLAKE3/blake3_neon.c index 9629e10836864..ee36721f87573 100644 --- a/llvm/lib/Support/BLAKE3/blake3_neon.c +++ b/llvm/lib/Support/BLAKE3/blake3_neon.c @@ -245,10 +245,11 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter, counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); } -void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +static void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { uint32x4_t h_vecs[8] = { set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), diff --git a/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h b/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h index d5be360815add..d24657465dd8f 100644 --- a/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h +++ b/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h @@ -10,7 +10,9 @@ #define blake3_hasher llvm_blake3_hasher #define blake3_chunk_state llvm_blake3_chunk_state #define blake3_compress_in_place llvm_blake3_compress_in_place +#define blake3_compress_subtree_wide llvm_blake3_compress_subtree_wide #define blake3_compress_xof llvm_blake3_compress_xof +#define blake3_xof_many llvm_blake3_xof_many #define 
blake3_hash_many llvm_blake3_hash_many #define blake3_simd_degree llvm_blake3_simd_degree #define blake3_compress_in_place_portable llvm_blake3_compress_in_place_portable diff --git a/llvm/lib/Support/rpmalloc/rpmalloc.c b/llvm/lib/Support/rpmalloc/rpmalloc.c index a06d3cdb5b52e..6f8b29e31e8ca 100644 --- a/llvm/lib/Support/rpmalloc/rpmalloc.c +++ b/llvm/lib/Support/rpmalloc/rpmalloc.c @@ -275,9 +275,11 @@ typedef volatile long atomic32_t; typedef volatile long long atomic64_t; typedef volatile void *atomicptr_t; -static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { return *src; } +static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { + return (int32_t)InterlockedOr(src, 0); +} static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) { - *dst = val; + InterlockedExchange(dst, val); } static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) { return (int32_t)InterlockedIncrement(val); @@ -293,20 +295,22 @@ static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val, return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0; } static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) { - *dst = val; + InterlockedExchange(dst, val); +} +static FORCEINLINE int64_t atomic_load64(atomic64_t *src) { + return (int64_t)InterlockedOr64(src, 0); } -static FORCEINLINE int64_t atomic_load64(atomic64_t *src) { return *src; } static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) { return (int64_t)InterlockedExchangeAdd64(val, add) + add; } static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) { - return (void *)*src; + return InterlockedCompareExchangePointer(src, 0, 0); } static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) { - *dst = val; + InterlockedExchangePointer(dst, val); } static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) { - *dst = val; + InterlockedExchangePointer(dst, val); } static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst, void *val) { diff --git a/llvm/lib/TableGen/StringToOffsetTable.cpp b/llvm/lib/TableGen/StringToOffsetTable.cpp index 17e1e660c15ee..41f82caa12f82 100644 --- a/llvm/lib/TableGen/StringToOffsetTable.cpp +++ b/llvm/lib/TableGen/StringToOffsetTable.cpp @@ -38,8 +38,8 @@ void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS, #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Woverlength-strings" #endif -static constexpr char {}Storage[] = )", - Name); +{} constexpr char {}{}Storage[] =)", + ClassPrefix.empty() ? "static" : "", ClassPrefix, Name); // MSVC silently miscompiles string literals longer than 64k in some // circumstances. The build system sets EmitLongStrLiterals to false when it @@ -83,10 +83,11 @@ static constexpr char {}Storage[] = )", #pragma GCC diagnostic pop #endif -static constexpr llvm::StringTable -{0} = {0}Storage; +{1} llvm::StringTable +{2}{0} = {0}Storage; )", - Name); + Name, ClassPrefix.empty() ? "static constexpr" : "const", + ClassPrefix); } void StringToOffsetTable::EmitString(raw_ostream &O) const { diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index c8e020d791e09..aea1bb0c6d75e 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -174,129 +174,174 @@ int TGLexer::peekNextChar(int Index) const { } tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { - TokStart = CurPtr; - // This always consumes at least one character. 
- int CurChar = getNextChar(); + while (true) { + TokStart = CurPtr; + // This always consumes at least one character. + int CurChar = getNextChar(); - switch (CurChar) { - default: - // Handle letters: [a-zA-Z_] - if (isValidIDChar(CurChar, /*First=*/true)) - return LexIdentifier(); - - // Unknown character, emit an error. - return ReturnError(TokStart, "unexpected character"); - case EOF: - // Lex next token, if we just left an include file. - // Note that leaving an include file means that the next - // symbol is located at the end of the 'include "..."' - // construct, so LexToken() is called with default - // false parameter. - if (processEOF()) - return LexToken(); + switch (CurChar) { + default: + // Handle letters: [a-zA-Z_] + if (isValidIDChar(CurChar, /*First=*/true)) + return LexIdentifier(); - // Return EOF denoting the end of lexing. - return tgtok::Eof; - - case ':': return tgtok::colon; - case ';': return tgtok::semi; - case ',': return tgtok::comma; - case '<': return tgtok::less; - case '>': return tgtok::greater; - case ']': return tgtok::r_square; - case '{': return tgtok::l_brace; - case '}': return tgtok::r_brace; - case '(': return tgtok::l_paren; - case ')': return tgtok::r_paren; - case '=': return tgtok::equal; - case '?': return tgtok::question; - case '#': - if (FileOrLineStart) { - tgtok::TokKind Kind = prepIsDirective(); - if (Kind != tgtok::Error) - return lexPreprocessor(Kind); - } + // Unknown character, emit an error. + return ReturnError(TokStart, "unexpected character"); + case EOF: + // Lex next token, if we just left an include file. + if (processEOF()) { + // Leaving an include file means that the next symbol is located at the + // end of the 'include "..."' construct. + FileOrLineStart = false; + break; + } - return tgtok::paste; + // Return EOF denoting the end of lexing. + return tgtok::Eof; + + case ':': + return tgtok::colon; + case ';': + return tgtok::semi; + case ',': + return tgtok::comma; + case '<': + return tgtok::less; + case '>': + return tgtok::greater; + case ']': + return tgtok::r_square; + case '{': + return tgtok::l_brace; + case '}': + return tgtok::r_brace; + case '(': + return tgtok::l_paren; + case ')': + return tgtok::r_paren; + case '=': + return tgtok::equal; + case '?': + return tgtok::question; + case '#': + if (FileOrLineStart) { + tgtok::TokKind Kind = prepIsDirective(); + if (Kind != tgtok::Error) + return lexPreprocessor(Kind); + } + + return tgtok::paste; - // The period is a separate case so we can recognize the "..." - // range punctuator. - case '.': - if (peekNextChar(0) == '.') { - ++CurPtr; // Eat second dot. + // The period is a separate case so we can recognize the "..." + // range punctuator. + case '.': if (peekNextChar(0) == '.') { - ++CurPtr; // Eat third dot. - return tgtok::dotdotdot; + ++CurPtr; // Eat second dot. + if (peekNextChar(0) == '.') { + ++CurPtr; // Eat third dot. + return tgtok::dotdotdot; + } + return ReturnError(TokStart, "invalid '..' punctuation"); } - return ReturnError(TokStart, "invalid '..' punctuation"); - } - return tgtok::dot; + return tgtok::dot; - case '\r': - llvm_unreachable("getNextChar() must never return '\r'"); + case '\r': + llvm_unreachable("getNextChar() must never return '\r'"); - case ' ': - case '\t': - // Ignore whitespace. - return LexToken(FileOrLineStart); - case '\n': - // Ignore whitespace, and identify the new line. - return LexToken(true); - case '/': - // If this is the start of a // comment, skip until the end of the line or - // the end of the buffer. 
- if (*CurPtr == '/') - SkipBCPLComment(); - else if (*CurPtr == '*') { - if (SkipCComment()) - return tgtok::Error; - } else // Otherwise, this is an error. - return ReturnError(TokStart, "unexpected character"); - return LexToken(FileOrLineStart); - case '-': case '+': - case '0': case '1': case '2': case '3': case '4': case '5': case '6': - case '7': case '8': case '9': { - int NextChar = 0; - if (isDigit(CurChar)) { - // Allow identifiers to start with a number if it is followed by - // an identifier. This can happen with paste operations like - // foo#8i. - int i = 0; - do { - NextChar = peekNextChar(i++); - } while (isDigit(NextChar)); - - if (NextChar == 'x' || NextChar == 'b') { - // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most - // likely a number. - int NextNextChar = peekNextChar(i); - switch (NextNextChar) { - default: - break; - case '0': case '1': - if (NextChar == 'b') - return LexNumber(); - [[fallthrough]]; - case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - if (NextChar == 'x') - return LexNumber(); - break; + case ' ': + case '\t': + // Ignore whitespace. + break; + case '\n': + // Ignore whitespace, and identify the new line. + FileOrLineStart = true; + break; + case '/': + // If this is the start of a // comment, skip until the end of the line or + // the end of the buffer. + if (*CurPtr == '/') + SkipBCPLComment(); + else if (*CurPtr == '*') { + if (SkipCComment()) + return tgtok::Error; + } else // Otherwise, this is an error. + return ReturnError(TokStart, "unexpected character"); + break; + case '-': + case '+': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + int NextChar = 0; + if (isDigit(CurChar)) { + // Allow identifiers to start with a number if it is followed by + // an identifier. This can happen with paste operations like + // foo#8i. + int i = 0; + do { + NextChar = peekNextChar(i++); + } while (isDigit(NextChar)); + + if (NextChar == 'x' || NextChar == 'b') { + // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most + // likely a number. 
+ int NextNextChar = peekNextChar(i); + switch (NextNextChar) { + default: + break; + case '0': + case '1': + if (NextChar == 'b') + return LexNumber(); + [[fallthrough]]; + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + if (NextChar == 'x') + return LexNumber(); + break; + } } } - } - if (isValidIDChar(NextChar, /*First=*/true)) - return LexIdentifier(); + if (isValidIDChar(NextChar, /*First=*/true)) + return LexIdentifier(); - return LexNumber(); - } - case '"': return LexString(); - case '$': return LexVarName(); - case '[': return LexBracket(); - case '!': return LexExclaim(); + return LexNumber(); + } + case '"': + return LexString(); + case '$': + return LexVarName(); + case '[': + return LexBracket(); + case '!': + return LexExclaim(); + } } } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 36f3a670808d4..12fc976a70ea7 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -598,6 +598,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( llvm_unreachable("Unsupported ElementSize"); } + // Preserve undef state until DOP's reg is defined. + unsigned DOPRegState = MI.getOperand(DOPIdx).isUndef() ? RegState::Undef : 0; + // // Create the destructive operation (if required) // @@ -616,10 +619,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(PredIdx).getReg()) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; + DOPRegState = 0; // Create the additional LSL to zero the lanes when the DstReg is not // unique. 
Zeros the lanes in z0 that aren't active in p0 with sequence @@ -638,8 +642,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( assert(DOPRegIsUnique && "The destructive operand should be unique"); PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) .addReg(DstReg, RegState::Define) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); DOPIdx = 0; + DOPRegState = 0; } // @@ -647,10 +652,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( // DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); + DOPRegState = DOPRegState | RegState::Kill; switch (DType) { case AArch64::DestructiveUnaryPassthru: - DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(PredIdx)) .add(MI.getOperand(SrcIdx)); break; @@ -659,20 +665,20 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) - .add(MI.getOperand(SrcIdx)); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) + .add(MI.getOperand(SrcIdx)); break; case AArch64::DestructiveTernaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(SrcIdx)) .add(MI.getOperand(Src2Idx)); break; } if (PRFX) { - finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator()); transferImpOps(MI, PRFX, DOP); + finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator()); } else transferImpOps(MI, DOP, DOP); @@ -1591,18 +1597,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, "Non-writeback variants of STGloop / STZGloop should not " "survive past PrologEpilogInserter."); case AArch64::STR_ZZZZXI: + case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4); case AArch64::STR_ZZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3); case AArch64::STR_ZZXI: + case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2); case AArch64::STR_PPXI: return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2); case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4); case AArch64::LDR_ZZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3); case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); case AArch64::LDR_PPXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2); diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 9973df865ea17..c1c1f0a1024d0 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -840,6 +840,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl", "HasDisableFastIncVL", "true", "Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">; +// On most processors we want to avoid moving from WZR to vector registers +// (relying on materializing 0 to a FPR and moving from there instead), +// but on some (in-order) cores it's preferable to avoid the extra instruction instead. 
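For reference, the code shape this tuning flag affects is lane zeroing. A minimal sketch using ACLE intrinsics (the function name is illustrative, and the instruction sequences in the comments are rough expected lowerings, not taken from this patch):

#include <arm_neon.h>

// Zero out lane 1 of a v4f32 value; this is the (vector_insert ..., fpimm0, ...)
// shape matched by the UseWzrToVecMove-guarded patterns later in this patch.
float32x4_t zero_lane1(float32x4_t v) {
  return vsetq_lane_f32(0.0f, v, 1);
}

// With use-wzr-to-vec-move set (small in-order cores), roughly:
//   ins v0.s[1], wzr              // one instruction, but a GPR->FPR transfer
// Without it (default on big cores), roughly:
//   movi v1.2d, #0
//   mov  v0.s[1], v1.s[0]         // one extra instruction, stays in the FPR file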
+def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move", + "UseWzrToVecMove", "true", + "Move from WZR to insert 0 into vector registers">; + //===----------------------------------------------------------------------===// // Architectures. // diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bde4ba993f69e..d04e6c45e2103 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1143,6 +1143,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR}); + setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::LOAD); @@ -2392,6 +2393,15 @@ static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { return false; } +bool isVectorizedBinOp(unsigned Opcode) { + switch (Opcode) { + case AArch64ISD::SQDMULH: + return true; + default: + return false; + } +} + // isOpcWithIntImmediate - This method tests to see if the node is a specific // opcode and that it has an immediate integer right operand. // If so Imm will receive the value. @@ -17145,7 +17155,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17153,6 +17163,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); @@ -17476,16 +17491,18 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, } bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const { - unsigned Factor = DeinterleavedValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n"); return false; } + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load\n"); - Value *FirstActive = *llvm::find_if(DeinterleavedValues, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast<VectorType>(FirstActive->getType()); + VectorType *VTy = getDeinterleavedVectorType(DI); const DataLayout &DL = LI->getModule()->getDataLayout(); bool UseScalable; @@ -17513,6 +17530,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue()); Value *BaseAddr = LI->getPointerOperand(); + Value *Result = nullptr; if (NumLoads > 1) { // Create multiple legal small ldN.
SmallVector<Value *> ExtractedLdValues(Factor, PoisonValue::get(VTy)); @@ -17533,35 +17551,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( } LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump()); } - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned J = 0; J < Factor; ++J) { - if (DeinterleavedValues[J]) - DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]); - } + + // Merge the values from different factors. + Result = PoisonValue::get(DI->getType()); + for (unsigned J = 0; J < Factor; ++J) + Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J); } else { - Value *Result; if (UseScalable) Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN"); else Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned I = 0; I < Factor; I++) { - if (DeinterleavedValues[I]) { - Value *NewExtract = Builder.CreateExtractValue(Result, I); - DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); - } - } } + + // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 + DI->replaceAllUsesWith(Result); return true; } bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleavedValues) const { + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleavedValues) const { unsigned Factor = InterleavedValues.size(); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n"); return false; } + StoreInst *SI = dyn_cast<StoreInst>(Store); + if (!SI) + return false; + assert(!Mask && "Unexpected mask on plain store"); VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType()); const DataLayout &DL = SI->getModule()->getDataLayout(); @@ -20126,8 +20144,9 @@ static SDValue performConcatVectorsCombine(SDNode *N, // size, combine into a binop of two concats of the source vectors. e.g.: // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d)) if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() && - DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() && - N1->hasOneUse()) { + (DAG.getTargetLoweringInfo().isBinOp(N0Opc) || + isVectorizedBinOp(N0Opc)) && + N0->hasOneUse() && N1->hasOneUse()) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); @@ -20986,6 +21005,98 @@ static SDValue performBuildVectorCombine(SDNode *N, return SDValue(); } +// A special combine for the sqdmulh family of instructions. +// smin(sra(mul(sext v0, sext v1), SHIFT_AMOUNT), SATURATING_VAL) +// can be reduced to sqdmulh(...)
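As a concrete model of that pattern (a scalar sketch for the i16 case; the helper name is illustrative and this code is separate from the patch): with clamp constant (1 << 15) - 1 and shift amount 15, each lane computes the Q15 multiply-high that SQDMULH implements.

#include <algorithm>
#include <cstdint>

int16_t sqdmulh_ref(int16_t a, int16_t b) {
  int32_t p = int32_t(a) * int32_t(b);        // mul(sext v0, sext v1)
  int32_t s = p >> 15;                        // sra by ShiftAmt - 1 == 15
  return int16_t(std::min(s, (1 << 15) - 1)); // smin against SATURATING_VAL
}

The only input that overflows is a == b == INT16_MIN (where p >> 15 == 32768), which is why matching the smin clamp alone suffices: on the negative side the result can never leave the i16 range.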
+static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) { + + if (N->getOpcode() != ISD::SMIN) + return SDValue(); + + EVT DestVT = N->getValueType(0); + + if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 || + DestVT.isScalableVector()) + return SDValue(); + + ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1)); + + if (!Clamp) + return SDValue(); + + MVT ScalarType; + unsigned ShiftAmt = 0; + switch (Clamp->getSExtValue()) { + case (1ULL << 15) - 1: + ScalarType = MVT::i16; + ShiftAmt = 16; + break; + case (1ULL << 31) - 1: + ScalarType = MVT::i32; + ShiftAmt = 32; + break; + default: + return SDValue(); + } + + SDValue Sra = N->getOperand(0); + if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse()) + return SDValue(); + + ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1)); + if (!RightShiftVec) + return SDValue(); + unsigned SExtValue = RightShiftVec->getSExtValue(); + + if (SExtValue != (ShiftAmt - 1)) + return SDValue(); + + SDValue Mul = Sra.getOperand(0); + if (Mul.getOpcode() != ISD::MUL) + return SDValue(); + + SDValue SExt0 = Mul.getOperand(0); + SDValue SExt1 = Mul.getOperand(1); + + if (SExt0.getOpcode() != ISD::SIGN_EXTEND || + SExt1.getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + + EVT SExt0Type = SExt0.getOperand(0).getValueType(); + EVT SExt1Type = SExt1.getOperand(0).getValueType(); + + if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType || + SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() || + SExt0Type.getVectorNumElements() == 1) + return SDValue(); + + SDLoc DL(N); + SDValue V0 = SExt0.getOperand(0); + SDValue V1 = SExt1.getOperand(0); + + // Ensure input vectors are extended to legal types + if (SExt0Type.getFixedSizeInBits() < 64) { + unsigned VecNumElements = SExt0Type.getVectorNumElements(); + EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements), + VecNumElements); + V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0); + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1); + } + + SDValue SQDMULH = + DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1); + + return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH); +} + +static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) { + if (SDValue V = trySQDMULHCombine(N, DAG)) { + return V; + } + + return SDValue(); +} + static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); @@ -26737,6 +26848,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performAddSubCombine(N, DCI); case ISD::BUILD_VECTOR: return performBuildVectorCombine(N, DCI, DAG); + case ISD::SMIN: + return performSMINCombine(N, DAG); case ISD::TRUNCATE: return performTruncateCombine(N, DAG, DCI); case AArch64ISD::ANDS: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 65fe08e92c235..713793ec77da3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -211,18 +211,19 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef<Value *>
DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override; + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleaveValues) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalAddScalableImmediate(int64_t) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index c1474773faa76..bc57537ad5dfb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,6 +20,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,6 +36,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -2482,8 +2484,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::LDR_PXI: case AArch64::LDR_ZXI: case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZZXI: case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::LDRBBui: case AArch64::LDRBui: case AArch64::LDRDui: @@ -2525,8 +2529,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::STR_PXI: case AArch64::STR_ZXI: case AArch64::STR_ZZXI: + case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::STR_ZZZXI: case AArch64::STR_ZZZZXI: + case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::STRBBui: case AArch64::STRBui: case AArch64::STRDui: @@ -4318,7 +4324,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, break; // SVE case AArch64::STR_ZZZZXI: + case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: Scale = TypeSize::getScalable(16); Width = TypeSize::getScalable(16 * 4); MinOffset = -256; @@ -4332,7 +4340,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MaxOffset = 253; break; case AArch64::STR_ZZXI: + case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: Scale = TypeSize::getScalable(16); Width = TypeSize::getScalable(16 * 2); MinOffset = -256; @@ -5559,8 +5569,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d; Offset = false; - } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || - AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register store without SVE store instructions"); + Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZXI; @@ -5584,8 +5598,12 @@ void
AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d; Offset = false; - } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || - AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register store without SVE store instructions"); + Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZZZXI; @@ -5736,8 +5754,12 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d; Offset = false; - } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || - AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register load without SVE load instructions"); + Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZXI; @@ -5761,8 +5783,12 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv2d; Offset = false; - } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || - AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register load without SVE load instructions"); + Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZZZXI; @@ -6264,13 +6290,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // LDRWui %0:sub_32, %stack.0 // if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { - const TargetRegisterClass *FillRC; + const TargetRegisterClass *FillRC = nullptr; switch (DstMO.getSubReg()) { default: - FillRC = nullptr; break; case AArch64::sub_32: - FillRC = &AArch64::GPR32RegClass; + if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg))) + FillRC = &AArch64::GPR32RegClass; break; case AArch64::ssub: FillRC = &AArch64::FPR32RegClass; @@ -7327,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return true; } // end switch (Pattern) return false; @@ -7367,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } +static bool getGatherPattern(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns, + unsigned LoadLaneOpCode,
unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. + if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have loads into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single offset register. + auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq<unsigned>(1, NumLanes - 1); + SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end()); + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. + auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non debug use. + if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns where we use LD1 instructions to load into +/// separate lanes of a 128 bit Neon register. We can increase Memory Level +/// Parallelism by loading into 2 Neon registers instead. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns) { + + // The pattern searches for loads into single lanes. + switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +static void +generateGatherPattern(MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<Register, unsigned> &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern + SmallVector<MachineInstr *> LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane.
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(true)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on opcode + auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); // immediate offset + }; + + // Load the remaining lanes into register 0. + auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + auto PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg0 = PrevReg; + + // First load into register 1. Perform a LDRSui to zero out the upper lanes in + // a single instruction. + auto Lane0Load = *LoadToLaneInstrsAscending.begin(); + auto OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + auto DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitLoad->getOperand(3).getReg()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. 
+ auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. + auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. + MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7401,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8656,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion @@ -9561,10 +9850,15 @@ AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, }; auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() { // At least one unsafe register is not dead. We do not want to outline at - // this point. If it is long enough to outline from, save the range - // [RangeBegin, RangeEnd). - if (RangeLen > 1) - Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); + // this point. If it is long enough to outline from and does not cross a + // bundle boundary, save the range [RangeBegin, RangeEnd). 
+ if (RangeLen <= 1) + return; + if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred()) + return; + if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred()) + return; + Ranges.emplace_back(RangeBegin, RangeEnd); }; // Find the first point where all unsafe registers are dead. // FIND: <-- end of first potential range diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da333e4b..02734866e7122 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ddc685fae5e9a..6c46b18d506c5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -419,6 +419,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">; +def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -1022,6 +1024,7 @@ def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull, [SDNPCommutative]>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull, [SDNPCommutative]>; +def AArch64sqdmulh : SDNode<"AArch64ISD::SQDMULH", SDT_AArch64mull>; // Reciprocal estimates and steps. def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; @@ -7376,6 +7379,7 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), (i64 0)), dsub)>; +let Predicates = [UseWzrToVecMove] in { def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>; def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), @@ -7386,6 +7390,7 @@ def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)) (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>; def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)), (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>; +} def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), @@ -9439,6 +9444,15 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), (EXTRACT_SUBREG V128:$Rm, dsub)), (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>; +def : Pat<(v4i16 (AArch64sqdmulh (v4i16 V64:$Rn), (v4i16 V64:$Rm))), + (SQDMULHv4i16 V64:$Rn, V64:$Rm)>; +def : Pat<(v2i32 (AArch64sqdmulh (v2i32 V64:$Rn), (v2i32 V64:$Rm))), + (SQDMULHv2i32 V64:$Rn, V64:$Rm)>; +def : Pat<(v8i16 (AArch64sqdmulh (v8i16 V128:$Rn), (v8i16 V128:$Rm))), + (SQDMULHv8i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i32 (AArch64sqdmulh (v4i32 V128:$Rn), (v4i32 V128:$Rm))), + (SQDMULHv4i32 V128:$Rn, V128:$Rm)>; + // Conversions within AdvSIMD types in the same register size are free. // But because we need a consistent lane ordering, in big endian many // conversions require one or more REV instructions. 
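The machine-combiner gather patterns added above target the following code shape (an illustrative ACLE sketch; the function name and the per-line lowering comments are assumptions about typical codegen, not taken from this patch):

#include <arm_neon.h>

uint32x4_t gather4(const uint32_t *p0, const uint32_t *p1,
                   const uint32_t *p2, const uint32_t *p3) {
  uint32x4_t v = vdupq_n_u32(0);
  v = vsetq_lane_u32(*p0, v, 0); // typically ldr s0, [x0] (the SUBREG_TO_REG lane)
  v = vsetq_lane_u32(*p1, v, 1); // ld1 { v0.s }[1], [x1]
  v = vsetq_lane_u32(*p2, v, 2); // ld1 { v0.s }[2], [x2]
  v = vsetq_lane_u32(*p3, v, 3); // ld1 { v0.s }[3], [x3]
  return v;
}

Each LD1 lane load both reads memory and merges into the previous vector value, so the four loads form one serial dependency chain. After the rewrite, lanes {0,1} and {2,3} load into two independent registers that a single zip1 merges, roughly halving the critical path.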
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 3b4c7a05d1d4b..782d62a7e5e13 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1666,7 +1666,7 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, "Given Opc should be a Load or Store with an immediate"); // OpcA will be the first instruction in the pair. if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { - Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); + Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0); return true; } diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 5379305bc7a7f..adc984ad795af 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", "Cortex-A320 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureBalanceFPOps, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", "Cortex-A55 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureFuseAddress]>; + FeatureFuseAddress, + FeatureUseWzrToVecMove]>; def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", "Cortex-A510 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler + FeaturePostRAScheduler, + FeatureUseWzrToVecMove ]>; def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", "Cortex-A520 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [ diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index eddb96979f7b8..0c4b4f4c3ed88 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2625,16 +2625,22 @@ let Predicates = [HasSVE_or_SME] in { // These get expanded to individual LDR_ZXI/STR_ZXI instructions in // AArch64ExpandPseudoInsts. 
let mayLoad = 1, hasSideEffects = 0 in { - def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + + def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } let mayStore = 1, hasSideEffects = 0 in { - def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + + def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } let AddedComplexity = 1 in { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 0956823346795..2409cc862f21c 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -270,6 +270,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { break; case NeoverseV2: case NeoverseV3: + CacheLineSize = 64; EpilogueVectorizationMinVF = 8; MaxInterleaveFactor = 4; ScatterOverhead = 13; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 8904402065696..90d3d92d6bbf5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2674,14 +2674,14 @@ static std::optional instCombineSVEInsr(InstCombiner &IC, static std::optional instCombineDMB(InstCombiner &IC, IntrinsicInst &II) { // If this barrier is post-dominated by identical one we can remove it - auto *NI = II.getNextNonDebugInstruction(); + auto *NI = II.getNextNode(); unsigned LookaheadThreshold = DMBLookaheadThreshold; auto CanSkipOver = [](Instruction *I) { return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects(); }; while (LookaheadThreshold-- && CanSkipOver(NI)) { auto *NIBB = NI->getParent(); - NI = NI->getNextNonDebugInstruction(); + NI = NI->getNextNode(); if (!NI) { if (auto *SuccBB = NIBB->getUniqueSuccessor()) NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime(); @@ 
-3724,7 +3724,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, - bool HasRealUse, const Instruction *I, Value *Scalar, + const Instruction *I, Value *Scalar, ArrayRef<std::pair<Value *, unsigned>> ScalarUserAndIdx) const { assert(Val->isVectorTy() && "This must be a vector type"); @@ -3744,12 +3744,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( } // The element at index zero is already inside the vector. - // - For a physical (HasRealUse==true) insert-element or extract-element + // - For an insert-element or extract-element // instruction that extracts integers, an explicit FPR -> GPR move is // needed. So it has non-zero cost. - // - For the rest of cases (virtual instruction or element type is float), - // consider the instruction free. - if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) + if (Index == 0 && !Val->getScalarType()->isIntegerTy()) return 0; // This is recognising a LD1 single-element structure to one lane of one @@ -3899,25 +3897,28 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, const Value *Op0, const Value *Op1) const { - bool HasRealUse = - Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0); - return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse); + // Treat insert at lane 0 into a poison vector as having zero cost. This + // ensures vector broadcasts via an insert + shuffle (which will be lowered to + // a single dup) are treated as cheap. + if (Opcode == Instruction::InsertElement && Index == 0 && Op0 && + isa<PoisonValue>(Op0)) + return 0; + return getVectorInstrCostHelper(Opcode, Val, CostKind, Index); } InstructionCost AArch64TTIImpl::getVectorInstrCost( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef<std::pair<Value *, unsigned>> ScalarUserAndIdx) const { - return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr, - Scalar, ScalarUserAndIdx); + return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar, + ScalarUserAndIdx); } InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const { - return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, - true /* HasRealUse */, &I); + return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I); } InstructionCost AArch64TTIImpl::getScalarizationOverhead( @@ -4126,10 +4127,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) { // SDIV/UDIV operations are lowered using SVE, then we can have less // costs.
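// A sketch of the new lane-0 costing rule above (illustrative helper, not
// the TTI interface). The IR idiom being kept cheap is the broadcast:
//   %v = insertelement <4 x float> poison, float %x, i64 0
//   %b = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> zeroinitializer
unsigned laneZeroInsertExtractCost(bool DstIsPoison, unsigned Lane,
                                   bool EltIsInteger) {
  if (DstIsPoison && Lane == 0)
    return 0; // insert into poison at lane 0 folds into the following splat
  if (Lane == 0 && !EltIsInteger)
    return 0; // FP element 0 is already in the right register
  return 1;   // integer lane 0 still pays at least the FPR -> GPR move
}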
- if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty) - ->getPrimitiveSizeInBits() - .getFixedValue() < 128) { - EVT VT = TLI->getValueType(DL, Ty); + if (VT.isSimple() && isa<FixedVectorType>(Ty) && + Ty->getPrimitiveSizeInBits().getFixedValue() < 128) { static const CostTblEntry DivTbl[]{ {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, @@ -5212,34 +5211,34 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll // AND: llvm/test/CodeGen/AArch64/reduce-and.ll static const CostTblEntry CostTblNoPairwise[]{ - {ISD::ADD, MVT::v8i8, 2}, - {ISD::ADD, MVT::v16i8, 2}, - {ISD::ADD, MVT::v4i16, 2}, - {ISD::ADD, MVT::v8i16, 2}, - {ISD::ADD, MVT::v2i32, 2}, - {ISD::ADD, MVT::v4i32, 2}, - {ISD::ADD, MVT::v2i64, 2}, - {ISD::OR, MVT::v8i8, 15}, - {ISD::OR, MVT::v16i8, 17}, - {ISD::OR, MVT::v4i16, 7}, - {ISD::OR, MVT::v8i16, 9}, - {ISD::OR, MVT::v2i32, 3}, - {ISD::OR, MVT::v4i32, 5}, - {ISD::OR, MVT::v2i64, 3}, - {ISD::XOR, MVT::v8i8, 15}, - {ISD::XOR, MVT::v16i8, 17}, - {ISD::XOR, MVT::v4i16, 7}, - {ISD::XOR, MVT::v8i16, 9}, - {ISD::XOR, MVT::v2i32, 3}, - {ISD::XOR, MVT::v4i32, 5}, - {ISD::XOR, MVT::v2i64, 3}, - {ISD::AND, MVT::v8i8, 15}, - {ISD::AND, MVT::v16i8, 17}, - {ISD::AND, MVT::v4i16, 7}, - {ISD::AND, MVT::v8i16, 9}, - {ISD::AND, MVT::v2i32, 3}, - {ISD::AND, MVT::v4i32, 5}, - {ISD::AND, MVT::v2i64, 3}, + {ISD::ADD, MVT::v8i8, 2}, + {ISD::ADD, MVT::v16i8, 2}, + {ISD::ADD, MVT::v4i16, 2}, + {ISD::ADD, MVT::v8i16, 2}, + {ISD::ADD, MVT::v2i32, 2}, + {ISD::ADD, MVT::v4i32, 2}, + {ISD::ADD, MVT::v2i64, 2}, + {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr + {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8 + {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr + {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16 + {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr + {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32 + {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov + {ISD::XOR, MVT::v8i8, 5}, // Same as above for OR... + {ISD::XOR, MVT::v16i8, 7}, + {ISD::XOR, MVT::v4i16, 4}, + {ISD::XOR, MVT::v8i16, 6}, + {ISD::XOR, MVT::v2i32, 3}, + {ISD::XOR, MVT::v4i32, 5}, + {ISD::XOR, MVT::v2i64, 3}, + {ISD::AND, MVT::v8i8, 5}, // Same as above for OR... + {ISD::AND, MVT::v16i8, 7}, + {ISD::AND, MVT::v4i16, 4}, + {ISD::AND, MVT::v8i16, 6}, + {ISD::AND, MVT::v2i32, 3}, + {ISD::AND, MVT::v4i32, 5}, + {ISD::AND, MVT::v2i64, 3}, }; switch (ISD) { default: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index ff0ab68a16a88..b27eb2ef7a39d 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { // A helper function called by 'getVectorInstrCost'. // - // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse' - // indicates whether the vector instruction is available in the input IR or - // just imaginary in vectorizer passes. - /// \param ScalarUserAndIdx encodes the information about extracts from a + // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'. + // \param ScalarUserAndIdx encodes the information about extracts from a /// vector with 'Scalar' being the value being extracted, 'User' being the user /// of the extract (nullptr if user is not known before vectorization) and /// 'Idx' being the extract lane.
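// The retuned OR/XOR/AND reduction costs above model a move-to-GPR plus a
// shift-and-fold sequence (e.g. the v8i8 OR cost of 5 corresponds to
// fmov + orr_lsr + orr_lsr + lsr + orr). A scalar sketch of that folding,
// assuming the eight i8 lanes sit in one 64-bit register:
#include <cstdint>
uint8_t orReduceV8i8(uint64_t Lanes) {
  Lanes |= Lanes >> 32;  // fold lanes 4..7 onto lanes 0..3
  Lanes |= Lanes >> 16;  // fold lanes 2..3 onto lanes 0..1
  Lanes |= Lanes >> 8;   // fold lane 1 onto lane 0
  return uint8_t(Lanes); // low byte now ORs all eight lanes
}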
InstructionCost getVectorInstrCostHelper( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, - bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr, + const Instruction *I = nullptr, Value *Scalar = nullptr, ArrayRef> ScalarUserAndIdx = {}) const; public: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 84884d98e6f9c..b9d3e1bf835b4 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -142,7 +142,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, uint64_t Value, MCContext &Ctx, const Triple &TheTriple, bool IsResolved) { int64_t SignedValue = static_cast(Value); - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: @@ -417,7 +417,7 @@ static bool shouldForceRelocation(const MCFixup &Fixup) { // same page as the ADRP and the instruction should encode 0x0. Assuming the // section isn't 0x1000-aligned, we therefore need to delegate this decision // to the linker -- a relocation! - return Fixup.getTargetKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21; + return Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21; } void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, @@ -431,7 +431,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (mc::isRelocation(Kind)) return; - if (Fixup.getTargetKind() == FK_Data_8 && TheTriple.isOSBinFormatELF()) { + if (Fixup.getKind() == FK_Data_8 && TheTriple.isOSBinFormatELF()) { auto RefKind = static_cast(Target.getSpecifier()); AArch64::Specifier SymLoc = AArch64::getSymbolLoc(RefKind); if (SymLoc == AArch64::S_AUTH || SymLoc == AArch64::S_AUTHADDR) { @@ -488,7 +488,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, AArch64::Specifier RefKind = static_cast(Target.getSpecifier()); if (AArch64::getSymbolLoc(RefKind) == AArch64::S_SABS || - (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) { + (!RefKind && Fixup.getKind() == AArch64::fixup_aarch64_movw)) { // If the immediate is negative, generate MOVN else MOVZ. // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ. 
if (SignedValue < 0) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index c3881fc79ba62..7618a57691868 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -57,7 +57,7 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) // assumes IsILP32 is true bool AArch64ELFObjectWriter::isNonILP32reloc(const MCFixup &Fixup, AArch64::Specifier RefKind) const { - if (Fixup.getTargetKind() != AArch64::fixup_aarch64_movw) + if (Fixup.getKind() != AArch64::fixup_aarch64_movw) return false; switch (RefKind) { case AArch64::S_ABS_G3: @@ -84,7 +84,7 @@ bool AArch64ELFObjectWriter::isNonILP32reloc(const MCFixup &Fixup, unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, bool IsPCRel) const { - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); AArch64::Specifier RefKind = static_cast(Target.getSpecifier()); AArch64::Specifier SymLoc = AArch64::getSymbolLoc(RefKind); @@ -212,7 +212,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup, } else { if (IsILP32 && isNonILP32reloc(Fixup, RefKind)) return ELF::R_AARCH64_NONE; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case FK_Data_1: reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index f2144375fd95e..08f547a85073e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -529,11 +529,9 @@ void AArch64TargetELFStreamer::finish() { static_cast(Ctx.getObjectFileInfo()->getTextSection()); bool Empty = true; for (auto &F : *Text) { - if (auto *DF = dyn_cast(&F)) { - if (!DF->getContents().empty()) { - Empty = false; - break; - } + if (F.getSize()) { + Empty = false; + break; } } if (Empty) @@ -561,8 +559,7 @@ void AArch64TargetELFStreamer::finish() { if (!Sym.isMemtag()) continue; auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx); - (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(), - *Ctx.getSubtargetInfo()); + S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE); } } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index f918e3cbc7b80..5c8f57664a2cc 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -356,7 +356,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, else if (TheTriple.isOSBinFormatCOFF()) MAI = new AArch64MCAsmInfoGNUCOFF(); else - llvm_unreachable("Invalid target"); // FIXME: This is not unreachable + reportFatalUsageError("unsupported object format"); // Initial state of the frame pointer is SP. 
unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 61458d7c24be0..1ac340a1b58a2 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -53,7 +53,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); Log2Size = ~0U; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 23f106a9c1d4d..007b481f84960 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -153,6 +153,9 @@ struct AMDGPULowerBufferFatPointersPass const TargetMachine &TM; }; +void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &); +extern char &AMDGPUPrepareAGPRAllocLegacyID; + void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &); extern char &AMDGPUReserveWWMRegsLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 04ec1716478e4..0e0e83b7a6b54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", "Use scratch_* flat memory instructions to access scratch" >; +def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", + "FlatGVSMode", + "true", + "Have GVS addressing mode with flat_* instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -1112,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; +def FeatureTanhInsts : SubtargetFeature<"tanh-insts", + "HasTanhInsts", + "true", + "Has v_tanh_f32/f16 instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -1954,6 +1966,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, FeatureArchitectedSGPRs, + FeatureFlatGVSMode, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureAtomicDsPkAdd16Insts, @@ -1972,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureBitOp3Insts, + FeatureTanhInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2381,6 +2395,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; +def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">, + AssemblerPredicate<(all_of FeatureFlatGVSMode)>; + def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; @@ -2667,6 +2684,9 @@ def HasDefaultComponentBroadcast def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; +def HasAddPC64Inst : Predicate<"Subtarget->hasAddPC64Inst()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">; def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">; @@ -2690,6 +2710,9 @@ def HasPseudoScalarTrans : 
Predicate<"Subtarget->hasPseudoScalarTrans()">, def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, AssemblerPredicate<(all_of FeatureBitOp3Insts)>; +def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, + AssemblerPredicate<(all_of FeatureTanhInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 79cf49f88d6dc..dedee46a44237 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,11 +13,9 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/Analysis/CycleAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" -#include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/Attributor.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 22b921fb2084f..5f1983791cfae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -45,12 +45,6 @@ static cl::opt WidenLoads( cl::ReallyHidden, cl::init(false)); -static cl::opt Widen16BitOps( - "amdgpu-codegenprepare-widen-16-bit-ops", - cl::desc( - "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), - cl::ReallyHidden, cl::init(false)); - static cl::opt BreakLargePHIs("amdgpu-codegenprepare-break-large-phis", cl::desc("Break large PHI nodes for DAGISel"), @@ -150,18 +144,6 @@ class AMDGPUCodeGenPrepareImpl bool canBreakPHINode(const PHINode &I); - /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to - /// binary operation \p V. - /// - /// \returns Binary operation \p V. - /// \returns \p T's base element bit width. - unsigned getBaseElementBitWidth(const Type *T) const; - - /// \returns Equivalent 32 bit integer type for given type \p T. For example, - /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32> - /// is returned. - Type *getI32Ty(IRBuilder<> &B, const Type *T) const; - /// \returns True if binary operation \p I is a signed binary operation, false /// otherwise. bool isSigned(const BinaryOperator &I) const; @@ -170,10 +152,6 @@ class AMDGPUCodeGenPrepareImpl /// signed 'icmp' operation, false otherwise. bool isSigned(const SelectInst &I) const; - /// \returns True if type \p T needs to be promoted to 32 bit integer type, - /// false otherwise. - bool needsPromotionToI32(const Type *T) const; - /// Return true if \p T is a legal scalar floating point type. bool isLegalFloatingTy(const Type *T) const; @@ -188,52 +166,6 @@ class AMDGPUCodeGenPrepareImpl computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal(); } - /// Promotes uniform binary operation \p I to equivalent 32 bit binary - /// operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by sign or zero extending operands to - /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and - /// truncating the result of 32 bit binary operation back to \p I's original - /// type. Division operation is not promoted. - /// - /// \returns True if \p I is promoted to equivalent 32 bit binary operation, - /// false otherwise. 
- bool promoteUniformOpToI32(BinaryOperator &I) const; - - /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by sign or zero extending operands to - /// 32 bits, and replacing \p I with 32 bit 'icmp' operation. - /// - /// \returns True. - bool promoteUniformOpToI32(ICmpInst &I) const; - - /// Promotes uniform 'select' operation \p I to 32 bit 'select' - /// operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by sign or zero extending operands to - /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the - /// result of 32 bit 'select' operation back to \p I's original type. - /// - /// \returns True. - bool promoteUniformOpToI32(SelectInst &I) const; - - /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' - /// intrinsic. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by zero extending the operand to 32 - /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the - /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the - /// shift amount is 32 minus \p I's base element bit width), and truncating - /// the result of the shift operation back to \p I's original type. - /// - /// \returns True. - bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; - /// \returns The minimum number of bits needed to store the value of \Op as an /// unsigned integer. Truncating to this size and then zero-extending to /// the original will not change the value. @@ -320,13 +252,11 @@ class AMDGPUCodeGenPrepareImpl bool visitInstruction(Instruction &I) { return false; } bool visitBinaryOperator(BinaryOperator &I); bool visitLoadInst(LoadInst &I); - bool visitICmpInst(ICmpInst &I); bool visitSelectInst(SelectInst &I); bool visitPHINode(PHINode &I); bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); - bool visitBitreverseIntrinsicInst(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -380,22 +310,6 @@ bool AMDGPUCodeGenPrepareImpl::run() { return MadeChange; } -unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const { - assert(needsPromotionToI32(T) && "T does not need promotion to i32"); - - if (T->isIntegerTy()) - return T->getIntegerBitWidth(); - return cast(T)->getElementType()->getIntegerBitWidth(); -} - -Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const { - assert(needsPromotionToI32(T) && "T does not need promotion to i32"); - - if (T->isIntegerTy()) - return B.getInt32Ty(); - return FixedVectorType::get(B.getInt32Ty(), cast(T)); -} - bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { return I.getOpcode() == Instruction::AShr || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; @@ -406,59 +320,11 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { cast(I.getOperand(0))->isSigned(); } -bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const { - if (!Widen16BitOps) - return false; - - const IntegerType *IntTy = dyn_cast(T); - if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) - return true; - - if (const VectorType *VT = dyn_cast(T)) { - // TODO: The set of packed operations is 
more limited, so may want to - // promote some anyway. - if (ST.hasVOP3PInsts()) - return false; - - return needsPromotionToI32(VT->getElementType()); - } - - return false; -} - bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { return Ty->isFloatTy() || Ty->isDoubleTy() || (Ty->isHalfTy() && ST.has16BitInsts()); } -// Return true if the op promoted to i32 should have nsw set. -static bool promotedOpIsNSW(const Instruction &I) { - switch (I.getOpcode()) { - case Instruction::Shl: - case Instruction::Add: - case Instruction::Sub: - return true; - case Instruction::Mul: - return I.hasNoUnsignedWrap(); - default: - return false; - } -} - -// Return true if the op promoted to i32 should have nuw set. -static bool promotedOpIsNUW(const Instruction &I) { - switch (I.getOpcode()) { - case Instruction::Shl: - case Instruction::Add: - case Instruction::Mul: - return true; - case Instruction::Sub: - return I.hasNoUnsignedWrap(); - default: - return false; - } -} - bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { Type *Ty = I.getType(); int TySize = DL.getTypeSizeInBits(Ty); @@ -467,134 +333,6 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I); } -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const { - assert(needsPromotionToI32(I.getType()) && - "I does not need promotion to i32"); - - if (I.getOpcode() == Instruction::SDiv || - I.getOpcode() == Instruction::UDiv || - I.getOpcode() == Instruction::SRem || - I.getOpcode() == Instruction::URem) - return false; - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp0 = nullptr; - Value *ExtOp1 = nullptr; - Value *ExtRes = nullptr; - Value *TruncRes = nullptr; - - if (isSigned(I)) { - ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - } else { - ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - } - - ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1); - if (Instruction *Inst = dyn_cast(ExtRes)) { - if (promotedOpIsNSW(cast(I))) - Inst->setHasNoSignedWrap(); - - if (promotedOpIsNUW(cast(I))) - Inst->setHasNoUnsignedWrap(); - - if (const auto *ExactOp = dyn_cast(&I)) - Inst->setIsExact(ExactOp->isExact()); - } - - TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const { - assert(needsPromotionToI32(I.getOperand(0)->getType()) && - "I does not need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType()); - Value *ExtOp0 = nullptr; - Value *ExtOp1 = nullptr; - Value *NewICmp = nullptr; - - if (I.isSigned()) { - ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - } else { - ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - } - NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1); - - I.replaceAllUsesWith(NewICmp); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const { - assert(needsPromotionToI32(I.getType()) && - "I does not 
need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp1 = nullptr; - Value *ExtOp2 = nullptr; - Value *ExtRes = nullptr; - Value *TruncRes = nullptr; - - if (isSigned(I)) { - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty); - } else { - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty); - } - ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2); - TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( - IntrinsicInst &I) const { - assert(I.getIntrinsicID() == Intrinsic::bitreverse && - "I must be bitreverse intrinsic"); - assert(needsPromotionToI32(I.getType()) && - "I does not need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); - Value *ExtRes = - Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp}); - Value *LShrOp = - Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); - Value *TruncRes = - Builder.CreateTrunc(LShrOp, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { return computeKnownBits(Op, DL, AC).countMaxActiveBits(); } @@ -1635,10 +1373,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (foldBinOpIntoSelect(I)) return true; - if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) && - UA.isUniform(&I) && promoteUniformOpToI32(I)) - return true; - if (UseMul24Intrin && replaceMulWithMul24(I)) return true; if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), @@ -1770,16 +1504,6 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { return false; } -bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) { - bool Changed = false; - - if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && - UA.isUniform(&I)) - Changed |= promoteUniformOpToI32(I); - - return Changed; -} - bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Value *Cond = I.getCondition(); Value *TrueVal = I.getTrueValue(); @@ -1787,12 +1511,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Value *CmpVal; CmpPredicate Pred; - if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) { - if (UA.isUniform(&I)) - return promoteUniformOpToI32(I); - return false; - } - // Match fract pattern with nan check. 
if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN()))) return false; @@ -2196,8 +1914,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { - case Intrinsic::bitreverse: - return visitBitreverseIntrinsicInst(I); case Intrinsic::minnum: case Intrinsic::minimumnum: case Intrinsic::minimum: @@ -2209,16 +1925,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { } } -bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) { - bool Changed = false; - - if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) && - UA.isUniform(&I)) - Changed |= promoteUniformBitreverseToI32(I); - - return Changed; -} - /// Match non-nan fract pattern. /// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0)) /// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 1b909568fc555..7b5d4077e85f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -55,6 +55,14 @@ def gi_vop3pmodsneg : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vop3pmodsnegs : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_dotiuvop3pmodsnegabs : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_wmmaopselvop3pmods : GIComplexOperandMatcher, GIComplexPatternEquiv; @@ -83,6 +91,10 @@ def gi_swmmacindex16 : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_swmmacindex32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 2540921b75e5d..25672a52345cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -447,6 +447,35 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { return; } + bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN(); + if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 && + CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) { + uint64_t C = 0; + bool AllConst = true; + unsigned EltSize = EltVT.getSizeInBits(); + for (unsigned I = 0; I < NumVectorElts; ++I) { + SDValue Op = N->getOperand(I); + if (Op.isUndef()) { + AllConst = false; + break; + } + uint64_t Val; + if (ConstantFPSDNode *CF = dyn_cast(Op)) { + Val = CF->getValueAPF().bitcastToAPInt().getZExtValue(); + } else + Val = cast(Op)->getZExtValue(); + C |= Val << (EltSize * I); + } + if (AllConst) { + SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64); + MachineSDNode *Copy = + CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV); + CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0), + RegClass); + return; + } + } + assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not " "supported yet"); // 32 = Max Num Vector Elements @@ -454,7 +483,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { // 1 = Vector Register Class SmallVector RegSeqArgs(NumVectorElts * 2 + 1); - bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN(); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); bool IsRegSeq = true; unsigned NOps = N->getNumOperands(); @@ -676,7 +704,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: case 
ISD::ConstantFP: { - if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) || + Subtarget->has64BitLiterals()) break; uint64_t Imm; @@ -3244,6 +3273,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } +// Select neg_lo from the i1 immediate operand. bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); // Literal i1 value set in intrinsic, represents SrcMods for the next operand. @@ -3259,6 +3289,47 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { return true; } +// Select both neg_lo and neg_hi from the i1 immediate operand. This is +// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies +// to the matrix's even k elements and neg_hi to its odd k elements. +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // 1 promotes packed values to signed, 0 treats them as unsigned. + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcSign = C->getZExtValue(); + if (SrcSign == 1) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +// Select neg, abs, or both neg and abs from the i16 immediate operand. +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcMod = C->getZExtValue(); + switch (SrcMod) { + default: // Any other value will be silently ignored (considered as 0).
+ break; + case 1: + Mods ^= SISrcMods::NEG; + break; + case 2: + Mods ^= SISrcMods::ABS; + break; + case 3: + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + break; + } + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast(In); @@ -3610,6 +3681,41 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + SDValue InI32; + + if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) { + const SDValue &ExtendSrc = In.getOperand(0); + if (ExtendSrc.getValueSizeInBits() == 32) + InI32 = ExtendSrc; + } else if (In->getOpcode() == ISD::BITCAST) { + const SDValue &CastSrc = In.getOperand(0); + if (CastSrc.getOpcode() == ISD::BUILD_VECTOR && + CastSrc.getOperand(0).getValueSizeInBits() == 32) { + ConstantSDNode *Zero = dyn_cast(CastSrc.getOperand(1)); + if (Zero && Zero->getZExtValue() == 0) + InI32 = CastSrc.getOperand(0); + } + } + + if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + const SDValue &ExtractVecEltSrc = InI32.getOperand(0); + ConstantSDNode *EltIdx = dyn_cast(InI32.getOperand(1)); + if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx && + EltIdx->getZExtValue() == 1) { + Key = 1; + Src = ExtractVecEltSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index f3b9364fdb92b..9967f46e085e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -222,6 +222,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, @@ -233,6 +235,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const; bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectSWMMACIndex32(SDValue In, SDValue &Src, SDValue &IndexKey) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3414fe758eff8..3d040fb705a8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4843,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +// Detect when CMP and SELECT use the same constant and fold them to avoid +// loading the constant twice. Specifically handles patterns like: +// %cmp = icmp eq i32 %val, 4242 +// %sel = select i1 %cmp, i32 4242, i32 %other +// It can be optimized to reuse %val instead of 4242 in select. 
+static SDValue +foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AMDGPUSubtarget *ST) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Check if constant should not be optimized - early return if not. + if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); + + // Only optimize normal floating-point values (finite, non-zero, and + // non-subnormal as per IEEE 754), skip optimization for inlinable + // floating-point constants. + if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); + + // Skip optimization for inlinable integer immediates. + // Inlinable immediates include: -16 to 64 (inclusive). + if (IntVal >= -16 && IntVal <= 64) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer).
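// Scalar analogue of the fold implemented above (illustrative only): when
// the comparison holds, the compared value and the constant are
// interchangeable, so the select can reuse the register and the non-inline
// literal is materialized once instead of twice.
int selectBefore(int Val, int Other) {
  return Val == 4242 ? 4242 : Other; // literal appears in cmp and select
}
int selectAfter(int Val, int Other) {
  return Val == 4242 ? Val : Other;  // select reuses Val; same result
}
// For FP the fold is restricted to normal constants: for a normal constant,
// ordered equality pins the operand to the identical bit pattern, which
// would not hold for +/-0.0 or NaN payloads.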
+ if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); @@ -5734,6 +5817,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) + NODE_NAME_CASE(PC_ADD_REL_OFFSET64) NODE_NAME_CASE(LDS) NODE_NAME_CASE(DUMMY_CHAIN) NODE_NAME_CASE(LOAD_D16_HI) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0dd2183b72b24..4e8c6c7ea3b27 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -545,6 +545,7 @@ enum NodeType : unsigned { /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, PC_ADD_REL_OFFSET, + PC_ADD_REL_OFFSET64, LDS, DUMMY_CHAIN, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 44eaebffb70dc..9a90787963d7b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -25,6 +25,7 @@ namespace { class AMDGPUInsertDelayAlu { public: + const GCNSubtarget *ST; const SIInstrInfo *SII; const TargetRegisterInfo *TRI; @@ -65,13 +66,16 @@ class AMDGPUInsertDelayAlu { // Types of delay that can be encoded in an s_delay_alu instruction. enum DelayType { VALU, TRANS, SALU, OTHER }; - // Get the delay type for an instruction with the specified TSFlags. - static DelayType getDelayType(uint64_t TSFlags) { - if (TSFlags & SIInstrFlags::TRANS) + // Get the delay type for a MachineInstr. + DelayType getDelayType(const MachineInstr &MI) { + if (SIInstrInfo::isTRANS(MI)) return TRANS; - if (TSFlags & SIInstrFlags::VALU) + // WMMA XDL ops are treated the same as TRANS. + if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI)) + return TRANS; + if (SIInstrInfo::isVALU(MI)) return VALU; - if (TSFlags & SIInstrFlags::SALU) + if (SIInstrInfo::isSALU(MI)) return SALU; return OTHER; } @@ -368,7 +372,7 @@ class AMDGPUInsertDelayAlu { continue; } - DelayType Type = getDelayType(MI.getDesc().TSFlags); + DelayType Type = getDelayType(MI); if (instructionWaitsForSGPRWrites(MI)) { auto It = State.find(LastSGPRFromVALU); @@ -456,12 +460,12 @@ class AMDGPUInsertDelayAlu { LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() << "\n"); - const GCNSubtarget &ST = MF.getSubtarget(); - if (!ST.hasDelayAlu()) + ST = &MF.getSubtarget(); + if (!ST->hasDelayAlu()) return false; - SII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + SII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); SchedModel = &SII->getSchedModel(); // Calculate the delay state for each basic block, iterating until we reach diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ea79c57080faa..1a63c48e3666c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3513,6 +3513,25 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { return Register(); } +Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const { + Register AnyExtSrc; + if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc)))) + return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? 
AnyExtSrc : Register(); + + // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF) + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return Register(); + + assert(Def->getNumOperands() == 3 && + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + + if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef())) + return Def->getOperand(1).getReg(); + + return Register(); +} + bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ if (!Subtarget->hasVMemToLDSLoad()) return false; @@ -4904,6 +4923,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } +// Select neg_lo from the i1 immediate operand. InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { // Literal i1 value set in intrinsic, represents SrcMods for the next operand. @@ -4919,6 +4939,50 @@ AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { }}; } +// Select both neg_lo and neg_hi from the i1 immediate operand. This is +// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies +// to the matrix's even k elements and neg_hi to its odd k elements. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const { + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // Value is in Imm operand as i1 sign extended to int64_t. + // 1(-1) promotes packed values to signed, 0 treats them as unsigned. + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() == -1) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +// Select neg, abs, or both neg and abs from the i16 immediate operand. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const { + + assert(Root.isImm() && "Modifier for C must be an immediate"); + + unsigned Mods = SISrcMods::OP_SEL_1; + switch (Root.getImm()) { + default: // Any other value will be silently ignored (considered as 0).
+ break; + case 1: + Mods ^= SISrcMods::NEG; + break; + case 2: + Mods ^= SISrcMods::ABS; + break; + case 3: + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + break; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { @@ -5149,6 +5213,35 @@ AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register S32 = matchZeroExtendFromS32(*MRI, Src); + if (!S32) + S32 = matchAnyExtendFromS32(Src); + + if (S32) { + const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI); + if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { + assert(Def->getNumOperands() == 3); + Register DstReg1 = Def->getOperand(1).getReg(); + if (mi_match(S32, *MRI, + m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) { + Src = Def->getOperand(2).getReg(); + Key = 1; + } + } + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 8e9e573147a86..2cb7904d27ccc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -201,6 +201,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectVOP3PModsNeg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsNegs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsNegAbs(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; @@ -217,6 +221,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { selectSWMMACIndex8(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSWMMACIndex16(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; @@ -411,6 +417,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector { // shift amount operand's `ShAmtBits` bits is unneeded. bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + /// Match an any extend from a 32-bit value to 64-bit. + Register matchAnyExtendFromS32(Register Reg) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index aa678df675fb6..e7bf88d2ee5b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2932,14 +2932,22 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : B.getMRI()->createGenericVirtualRegister(ConstPtrTy); - MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) - .addDef(PCReg); + if (ST.has64BitLiterals()) { + assert(GAFlags != SIInstrInfo::MO_NONE); - MIB.addGlobalAddress(GV, Offset, GAFlags); - if (GAFlags == SIInstrInfo::MO_NONE) - MIB.addImm(0); - else - MIB.addGlobalAddress(GV, Offset, GAFlags + 1); + MachineInstrBuilder MIB = + B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg); + MIB.addGlobalAddress(GV, Offset, GAFlags + 2); + } else { + MachineInstrBuilder MIB = + B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg); + + MIB.addGlobalAddress(GV, Offset, GAFlags); + if (GAFlags == SIInstrInfo::MO_NONE) + MIB.addImm(0); + else + MIB.addGlobalAddress(GV, Offset, GAFlags + 1); + } if (!B.getMRI()->getRegClassOrNull(PCReg)) B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); @@ -2955,6 +2963,15 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress( MachineRegisterInfo &MRI) const { bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; + if (RequiresHighHalf && ST.has64BitLiterals()) { + if (!MRI.getRegClassOrNull(DstReg)) + MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + B.buildInstr(AMDGPU::S_MOV_B64) + .addDef(DstReg) + .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64); + return; + } + LLT S32 = LLT::scalar(32); // Use the destination directly, if and only if we store the lower address @@ -7622,6 +7639,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh8_intersect_ray: return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S64 = LLT::scalar(64); + if (MRI.getType(Index) != S64) + MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: @@ -7636,15 +7667,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); return true; } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { Register Index = MI.getOperand(7).getReg(); - LLT S32 = LLT::scalar(32); - if (MRI.getType(Index) != S32) - MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ? 
LLT::scalar(64) + : LLT::scalar(32); + if (MRI.getType(Index) != IdxTy) + MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0)); return true; } + case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2dec16de940d1..c84a0f6e31384 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -50,6 +50,7 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) { default: return AMDGPUMCExpr::S_None; case SIInstrInfo::MO_GOTPCREL: + case SIInstrInfo::MO_GOTPCREL64: return AMDGPUMCExpr::S_GOTPCREL; case SIInstrInfo::MO_GOTPCREL32_LO: return AMDGPUMCExpr::S_GOTPCREL32_LO; @@ -59,10 +60,14 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) { return AMDGPUMCExpr::S_REL32_LO; case SIInstrInfo::MO_REL32_HI: return AMDGPUMCExpr::S_REL32_HI; + case SIInstrInfo::MO_REL64: + return AMDGPUMCExpr::S_REL64; case SIInstrInfo::MO_ABS32_LO: return AMDGPUMCExpr::S_ABS32_LO; case SIInstrInfo::MO_ABS32_HI: return AMDGPUMCExpr::S_ABS32_HI; + case SIInstrInfo::MO_ABS64: + return AMDGPUMCExpr::S_ABS64; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 250547acb1ee7..b6c6d927d0e89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -114,6 +114,7 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) +MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass()) MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp new file mode 100644 index 0000000000000..3b06e9b00ac69 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -0,0 +1,108 @@ +//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Make simple transformations to relax register constraints for cases which can +// allocate to AGPRs or VGPRs. Replace materialize of inline immediates into +// AGPR or VGPR with a pseudo with an AV_* class register constraint. This +// allows later passes to inflate the register class if necessary. The register +// allocator does not know to replace instructions to relax constraints. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUPrepareAGPRAlloc.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc" + +namespace { + +class AMDGPUPrepareAGPRAllocImpl { +private: + const SIInstrInfo &TII; + MachineRegisterInfo &MRI; + +public: + AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI) + : TII(*ST.getInstrInfo()), MRI(MRI) {} + bool run(MachineFunction &MF); +}; + +class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) { + initializeAMDGPUPrepareAGPRAllocLegacyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) +INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) + +char AMDGPUPrepareAGPRAllocLegacy::ID = 0; + +char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID; + +bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); +} + +PreservedAnalyses +AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); + return PreservedAnalyses::all(); +} + +bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { + if (MRI.isReserved(AMDGPU::AGPR0)) + return false; + + const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 && + TII.isInlineConstant(MI, 1)) || + (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOperand(1).isImm())) { + MI.setDesc(AVImmPseudo); + Changed = true; + } + } + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h new file mode 100644 index 0000000000000..dc598c98f241b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h @@ -0,0 +1,23 @@ +//===- AMDGPUPrepareAGPRAlloc.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class AMDGPUPrepareAGPRAllocPass + : public PassInfoMixin<AMDGPUPrepareAGPRAllocPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 7a2a7fc250e27..f5e14c71b02d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -88,7 +88,7 @@ void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers( // are %p and %s, which use to know if we // are either storing a literal string or a // pointer to the printf buffer. - static const char ConvSpecifiers[] = "cdieEfgGaosuxXp"; + static const char ConvSpecifiers[] = "cdieEfFgGaAosuxXp"; size_t CurFmtSpecifierIdx = 0; size_t PrevFmtSpecifierIdx = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 6a59a28b1d32c..411159c8aa33d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/Support/AMDGPUAddrSpace.h" #define DEBUG_TYPE "amdgpu-regbanklegalize" diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 965053ffe8624..cbbb57c6f8122 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4689,6 +4689,44 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x4_f32: + case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x32_f16: + case Intrinsic::amdgcn_wmma_f16_16x16x32_f16: + case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: + case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: + case 
Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 46027b8890234..8101c68986241 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -167,77 +167,39 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); + Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass, + /*IncludeCalls=*/false); + if (ST.hasMAIInsts()) + Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass, + /*IncludeCalls=*/false); // If there are no calls, MachineRegisterInfo can tell us the used register // count easily. // A tail call isn't considered a call for MachineFrameInfo's purposes. if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass); - Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass); - if (ST.hasMAIInsts()) - Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass); + Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, + /*IncludeCalls=*/false); return Info; } int32_t MaxVGPR = -1; - int32_t MaxAGPR = -1; - int32_t MaxSGPR = -1; Info.CalleeSegmentSize = 0; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { - // TODO: Check regmasks? Do they occur anywhere except calls? 
- for (const MachineOperand &MO : MI.operands()) { - unsigned Width = 0; - bool IsSGPR = false; - bool IsAGPR = false; + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; Register Reg = MO.getReg(); switch (Reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::M0_LO16: - case AMDGPU::M0_HI16: - case AMDGPU::SRC_SHARED_BASE_LO: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT_LO: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE_LO: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT_LO: - case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: - case AMDGPU::SGPR_NULL: - case AMDGPU::SGPR_NULL64: - case AMDGPU::MODE: - continue; - case AMDGPU::NoRegister: assert(MI.isDebugInstr() && "Instruction uses invalid noreg register"); continue; - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - case AMDGPU::VCC_LO_LO16: - case AMDGPU::VCC_LO_HI16: - case AMDGPU::VCC_HI_LO16: - case AMDGPU::VCC_HI_HI16: - Info.UsesVCC = true; - continue; - - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - continue; - case AMDGPU::XNACK_MASK: case AMDGPU::XNACK_MASK_LO: case AMDGPU::XNACK_MASK_HI: @@ -267,170 +229,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( break; } - if (AMDGPU::SGPR_32RegClass.contains(Reg) || - AMDGPU::SGPR_LO16RegClass.contains(Reg) || - AMDGPU::SGPR_HI16RegClass.contains(Reg)) { - IsSGPR = true; - Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || - AMDGPU::VGPR_16RegClass.contains(Reg)) { - IsSGPR = false; - Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || - AMDGPU::AGPR_LO16RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 1; - } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) { - IsSGPR = true; - Width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { - IsSGPR = false; - Width = 2; - } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { - IsSGPR = false; - Width = 3; - } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { - IsSGPR = true; - Width = 3; - } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 3; - } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) { - IsSGPR = true; - Width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { - IsSGPR = false; - Width = 4; - } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 4; - } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { - IsSGPR = false; - Width = 5; - } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { - IsSGPR = true; - Width = 5; - } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 5; - } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { - IsSGPR = false; - Width = 6; - } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { - IsSGPR = true; - Width = 6; - } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 6; - } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { - IsSGPR = false; - Width = 7; - } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { - IsSGPR = true; - Width = 7; - } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 7; - } else if 
(AMDGPU::SReg_256RegClass.contains(Reg)) { - IsSGPR = true; - Width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { - IsSGPR = false; - Width = 8; - } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 8; - } else if (AMDGPU::VReg_288RegClass.contains(Reg)) { - IsSGPR = false; - Width = 9; - } else if (AMDGPU::SReg_288RegClass.contains(Reg)) { - IsSGPR = true; - Width = 9; - } else if (AMDGPU::AReg_288RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 9; - } else if (AMDGPU::VReg_320RegClass.contains(Reg)) { - IsSGPR = false; - Width = 10; - } else if (AMDGPU::SReg_320RegClass.contains(Reg)) { - IsSGPR = true; - Width = 10; - } else if (AMDGPU::AReg_320RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 10; - } else if (AMDGPU::VReg_352RegClass.contains(Reg)) { - IsSGPR = false; - Width = 11; - } else if (AMDGPU::SReg_352RegClass.contains(Reg)) { - IsSGPR = true; - Width = 11; - } else if (AMDGPU::AReg_352RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 11; - } else if (AMDGPU::VReg_384RegClass.contains(Reg)) { - IsSGPR = false; - Width = 12; - } else if (AMDGPU::SReg_384RegClass.contains(Reg)) { - IsSGPR = true; - Width = 12; - } else if (AMDGPU::AReg_384RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 12; - } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { - IsSGPR = true; - Width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { - IsSGPR = false; - Width = 16; - } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 16; - } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { - IsSGPR = true; - Width = 32; - } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - Width = 32; - } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 32; - } else { - // We only expect TTMP registers or registers that do not belong to - // any RC. - assert((AMDGPU::TTMP_32RegClass.contains(Reg) || - AMDGPU::TTMP_64RegClass.contains(Reg) || - AMDGPU::TTMP_128RegClass.contains(Reg) || - AMDGPU::TTMP_256RegClass.contains(Reg) || - AMDGPU::TTMP_512RegClass.contains(Reg) || - !TRI.getPhysRegBaseClass(Reg)) && - "Unknown register class"); - } + const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg); + assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) || + TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) || + AMDGPU::TTMP_64RegClass.contains(Reg) || + AMDGPU::TTMP_128RegClass.contains(Reg) || + AMDGPU::TTMP_256RegClass.contains(Reg) || + AMDGPU::TTMP_512RegClass.contains(Reg)) && + "Unknown register class"); + + if (!RC || !TRI.isVGPRClass(RC)) + continue; + + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; - if (IsSGPR) { - MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; - } else if (IsAGPR) { - MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; - } else { - MaxVGPR = MaxUsed > MaxVGPR ? 
MaxUsed : MaxVGPR; - } + MaxVGPR = std::max(MaxUsed, MaxVGPR); } if (MI.isCall()) { @@ -492,9 +306,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( } } - Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; - Info.NumAGPR = MaxAGPR + 1; return Info; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 1f6002a3c6a20..dfe0cbf18c476 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -341,6 +341,10 @@ foreach intr = AMDGPUWMMAIntrinsicsGFX11 in def : SourceOfDivergence; foreach intr = AMDGPUWMMAIntrinsicsGFX12 in def : SourceOfDivergence; +foreach intr = AMDGPUWMMAIntrinsicsGFX1250 in +def : SourceOfDivergence; +foreach intr = AMDGPUSWMMACIntrinsicsGFX1250 in +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f4dc4a483181c..c865082a1dcea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -25,6 +25,7 @@ #include "AMDGPUMacroFusion.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUPreloadKernArgProlog.h" +#include "AMDGPUPrepareAGPRAlloc.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUReserveWWMRegs.h" #include "AMDGPUResourceUsageAnalysis.h" @@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGlobalISel(*PR); initializeAMDGPUAsmPrinterPass(*PR); initializeAMDGPUDAGToDAGISelLegacyPass(*PR); + initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR); initializeGCNDPPCombineLegacyPass(*PR); initializeSILowerI1CopiesLegacyPass(*PR); initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); @@ -1196,6 +1198,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; + void addPreRegAlloc() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; @@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() { TargetPassConfig::addFastRegAlloc(); } +void GCNPassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOptLevel::None) + addPass(&AMDGPUPrepareAGPRAllocLegacyID); +} + void GCNPassConfig::addOptimizedRegAlloc() { if (EnableDCEInRA) insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); @@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( Base::addOptimizedRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const { + if (getOptLevel() != CodeGenOptLevel::None) + addPass(AMDGPUPrepareAGPRAllocPass()); +} + Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( AddMachinePass &addPass) const { // TODO: Check --regalloc-npm option @@ -2284,6 +2297,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { Base::addPostRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const { + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIShrinkInstructionsPass()); + addPass(SIPostRABundlerPass()); +} + void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { addPass(GCNCreateVOPDPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 
3c62cd19c6e57..e0f1296ddded8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -181,8 +181,11 @@ class AMDGPUCodeGenPassBuilder void addMachineSSAOptimization(AddMachinePass &) const; void addPostRegAlloc(AddMachinePass &) const; void addPreEmitPass(AddMachinePass &) const; + void addPreEmitRegAlloc(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; + void addPreRegAlloc(AddMachinePass &) const; void addOptimizedRegAlloc(AddMachinePass &) const; + void addPreSched2(AddMachinePass &) const; /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. Otherwise its default will be used diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 6439230b8769f..43d4e8db791b0 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -157,6 +157,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyNegHi, ImmTyIndexKey8bit, ImmTyIndexKey16bit, + ImmTyIndexKey32bit, ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, @@ -174,8 +175,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyWaitEXP, ImmTyWaitVAVDst, ImmTyWaitVMVSrc, - ImmTyByteSel, ImmTyBitOp3, + ImmTyMatrixAReuse, + ImmTyMatrixBReuse, + ImmTyByteSel, }; // Immediate operand kind. @@ -419,6 +422,9 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isCPol() const { return isImmTy(ImmTyCPol); } bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } + bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); } + bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); } + bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppFI() const { return isImmTy(ImmTyDppFI); } @@ -747,6 +753,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64); } + bool isVISrc_512_f64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f64); + } + bool isVISrc_128B16() const { return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16); } @@ -1116,6 +1126,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyCPol: OS << "CPol"; break; case ImmTyIndexKey8bit: OS << "index_key"; break; case ImmTyIndexKey16bit: OS << "index_key"; break; + case ImmTyIndexKey32bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1162,8 +1173,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyWaitEXP: OS << "WaitEXP"; break; case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break; case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break; - case ImmTyByteSel: OS << "ByteSel" ; break; case ImmTyBitOp3: OS << "BitOp3"; break; + case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; + case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; + case ImmTyByteSel: OS << "ByteSel" ; break; } // clang-format on } @@ -1700,6 +1713,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { AMDGPUOperand::ImmTy ImmTy); ParseStatus parseIndexKey8bit(OperandVector &Operands); ParseStatus parseIndexKey16bit(OperandVector &Operands); + ParseStatus parseIndexKey32bit(OperandVector 
&Operands); ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); @@ -3981,8 +3995,8 @@ bool AMDGPUAsmParser::validateVOPD(const MCInst &Inst, bool AsVOPD3 = MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3; if (AsVOPD3) { - for (unsigned I = 0, E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + for (const std::unique_ptr<MCParsedAsmOperand> &Operand : Operands) { + AMDGPUOperand &Op = (AMDGPUOperand &)*Operand; if ((Op.isRegKind() || Op.isImmTy(AMDGPUOperand::ImmTyNone)) && (Op.getModifiers().getFPModifiersOperand() & SISrcMods::ABS)) Error(Op.getStartLoc(), "ABS not allowed in VOPD3 instructions"); @@ -7153,7 +7167,9 @@ ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands, if (!Res.isSuccess()) return Res; - if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1)) + if ((ImmTy == AMDGPUOperand::ImmTyIndexKey16bit || + ImmTy == AMDGPUOperand::ImmTyIndexKey32bit) && + (ImmVal < 0 || ImmVal > 1)) return Error(Loc, Twine("out of range ", StringRef(Pref))); if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3)) @@ -7171,6 +7187,10 @@ ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) { return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit); } +ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey32bit); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -9272,6 +9292,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, DefaultVal); } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAReuse, 0); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_b_reuse)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBReuse, 0); + int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); if (NegLoIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); @@ -9378,6 +9406,10 @@ void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyIndexKey16bit); + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_32bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey32bit); + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClamp); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index e3519f192137c..42edec0d01493 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp + AMDGPUPrepareAGPRAlloc.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 3625db9a4791f..c8a4e22ed1dae 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -200,6 +200,7 @@ class VFLAT_Real op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{95-72} = 
!if(ps.has_offset, offset, ?); } +// TODO: Rename to FlatSaddrTable; it now handles both global and flat GVS addressing modes. class GlobalSaddrTable { bit IsSaddr = is_saddr; string SaddrOp = Name; } @@ -237,10 +238,18 @@ class FLAT_Load_Pseudo< let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -multiclass FLAT_Load_Pseudo_t16 { - def "" : FLAT_Load_Pseudo; +multiclass FLAT_Flat_Load_Pseudo { + def "" : FLAT_Load_Pseudo, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Load_Pseudo, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Load_Pseudo_t16 { + defm "" : FLAT_Flat_Load_Pseudo; let True16Predicate = UseRealTrue16Insts in - def _t16 : FLAT_Load_Pseudo, True16D16Table; + defm _t16 : FLAT_Flat_Load_Pseudo, True16D16Table; } class FLAT_Store_Pseudo { - def "" : FLAT_Store_Pseudo; - let OtherPredicates = [HasTrue16BitInsts] in - def _t16 : FLAT_Store_Pseudo, True16D16Table; +multiclass FLAT_Flat_Store_Pseudo { + def "" : FLAT_Store_Pseudo, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Store_Pseudo, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Store_Pseudo_t16 { + defm "" : FLAT_Flat_Store_Pseudo; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in { + def _t16 : FLAT_Store_Pseudo, + GlobalSaddrTable<0, Name16>, + True16D16Table; + def _SADDR_t16 : FLAT_Store_Pseudo, + GlobalSaddrTable<1, Name16>, + True16D16Table; + } } multiclass FLAT_Global_Load_Pseudo { @@ -657,6 +682,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR : FLAT_AtomicNoRet_Pseudo , + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo_RTN< @@ -665,15 +702,29 @@ multiclass FLAT_Atomic_Pseudo_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand.ret> { + RegisterOperand data_op = getLdStRegisterOperand.ret, + RegisterOperand vdst_op = getLdStRegisterOperand.ret> { def _RTN : FLAT_AtomicRet_Pseudo .ret:$vdst), + (outs vdst_op:$vdst), (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR_RTN : FLAT_AtomicRet_Pseudo , + GlobalSaddrTable<1, opName#"_rtn"> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let PseudoInstr = NAME#"_SADDR_RTN"; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo< @@ -762,36 +813,36 @@ multiclass FLAT_Global_Atomic_Pseudo< // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", 
VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>; +defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>; +defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>; +defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>; +defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>; +defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>; +defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>; +defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>; +defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>; -def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; -def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; -def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; +defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>; +defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>; +defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>; +defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>; let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">; -def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">; -def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">; +defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">; +defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">; +defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">; } -def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; -def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; +defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; +defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; } -defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">; -defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">; +defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">; +defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">; defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", VGPR_32, i32, v2i32, VReg_64>; @@ -1200,6 +1251,16 @@ class GlobalLoadSaddrPat_D16 ; +class FlatLoadSaddrPat_D16 : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), + (inst $saddr, $voffset, $offset, (i32 0), $in) +>; + +class FlatLoadSaddrPat_D16_t16 : GCNPat < + (vt (node (GlobalSAddr (i64 
SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), + (inst $saddr, $voffset, $offset, (i32 0)) +>; + class GlobalLoadSaddrPat_D16_t16 : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, (i32 0)) @@ -1210,13 +1271,13 @@ class FlatLoadSignedPat (inst $vaddr, $offset) >; -class GlobalLoadSaddrPat : GCNPat < +class FlatLoadSaddrPat : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, 0) >; -class GlobalStoreSaddrPat : GCNPat < +class FlatStoreSaddrPat : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) >; @@ -1394,7 +1455,7 @@ multiclass GlobalFLATLoadPats(!cast(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1404,7 +1465,7 @@ multiclass GlobalFLATLoadPats_D16(!cast(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat_D16(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1425,7 +1486,7 @@ multiclass GlobalFLATStorePats(!cast(inst)#"_SADDR"), node, vt> { + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1435,7 +1496,7 @@ multiclass GlobalFLATStorePats_D16_t16(inst#"_SADDR_t16"), node, vt> { + def : FlatStoreSaddrPat(inst#"_SADDR_t16"), node, vt> { let AddedComplexity = 11; } } @@ -1568,80 +1629,129 @@ multiclass ScratchFLATLoadPats_D16_t16 { + def : FlatLoadPat ; + + def : FlatLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16 { + def : FlatLoadPat_D16 ; + + def : FlatLoadSaddrPat_D16(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16_t16 { + def : FlatLoadPat_D16_t16 ; + + def : FlatLoadSaddrPat_D16_t16(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats { + def : FlatStorePat ; + + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats_t16 { + def : FlatStorePat (!cast(inst)#"_t16"), node, vt>; + + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR_t16"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in let True16Predicate = p in { - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatStorePat ; - def : FlatStorePat ; - def : FlatStorePat ; - def : 
FlatStorePat ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatStorePats ; + defm : FlatStorePats ; + defm : FlatStorePats ; + defm : FlatStorePats ; } let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatStorePat ; - def : FlatStorePat ; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatStorePats_t16 ; + defm : FlatStorePats_t16 ; def : FlatStorePat ; def : FlatStorePat ; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts -def : FlatLoadPat ; -def : FlatLoadPat ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; foreach vt = Reg32Types.types in { -def : FlatLoadPat ; -def : FlatStorePat ; +defm : FlatLoadPats ; +defm : FlatStorePats ; } foreach vt = VReg_64.RegTypes in { -def : FlatStorePat ; -def : FlatLoadPat ; +defm : FlatStorePats ; +defm : FlatLoadPats ; } -def : FlatStorePat ; +defm : FlatStorePats ; foreach vt = VReg_128.RegTypes in { -def : FlatLoadPat ; -def : FlatStorePat ; +defm : FlatLoadPats ; +defm : FlatStorePats ; } -def : FlatStorePat ; -def : FlatStorePat ; -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; +defm : FlatStorePats ; +defm : FlatStorePats ; +defm : FlatStorePats ; + foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1684,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } // end foreach as +defm : FlatStorePats ; +defm : FlatStorePats ; + let SubtargetPredicate = isGFX12Plus in { defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; @@ -1692,25 +1805,25 @@ let SubtargetPredicate = isGFX12Plus in { } let OtherPredicates = [HasD16LoadStore] in { -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; } let OtherPredicates = [D16PreservesUnusedBits] in { // TODO: Handle atomic loads -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; } } // End OtherPredicates = [HasFlatAddressSpace] @@ -1782,6 +1895,7 @@ defm : GlobalFLATStorePats ; // appropriate waits. 
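// For illustration, and as a summary of the AddedComplexity values set in the
// multiclasses above: the flat GVS saddr patterns use AddedComplexity = 9
// while the global saddr patterns use 11, so a global access still prefers
// its own saddr form, and a flat access with a uniform 64-bit base prefers
// the saddr (GVS) form over the plain vaddr form. In assembly terms (a
// sketch of the two address forms involved):
//
//   flat_load_b32 v0, v[2:3]       ; vaddr form
//   flat_load_b32 v0, v2, s[4:5]   ; saddr (GVS) form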
defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; @@ -1821,6 +1935,7 @@ defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; @@ -2832,14 +2947,7 @@ multiclass VFLAT_Real_Base_gfx12 op, VFLAT_Aliases_gfx12, VFLAT_Real_gfx12; -multiclass VFLAT_Real_Atomics_gfx12 op, - string name = get_FLAT_ps.Mnemonic, - string alias = name> : - VFLAT_Real_Base_gfx12 { - defm _RTN : VFLAT_Real_gfx12; -} - -multiclass VGLOBAL_Real_AllAddr_gfx12 op, +multiclass VFLAT_Real_AllAddr_gfx12 op, string name = get_FLAT_ps.Mnemonic, string alias = name> : VFLAT_Real_Base_gfx12 { @@ -2853,7 +2961,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200 op> { } } -multiclass VGLOBAL_Real_AllAddr_gfx12_w64 op, +multiclass VFLAT_Real_AllAddr_gfx12_w64 op, string name = get_FLAT_ps.Mnemonic> : VFLAT_Aliases_gfx12 { let DecoderNamespace = "GFX12W64" in { @@ -2862,10 +2970,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64 op, } } -multiclass VGLOBAL_Real_Atomics_gfx12 op, +multiclass VFLAT_Real_Atomics_gfx12 op, string name = get_FLAT_ps.Mnemonic, string alias = name> : - VGLOBAL_Real_AllAddr_gfx12 { + VFLAT_Real_AllAddr_gfx12 { defm _RTN : VFLAT_Real_gfx12; defm _SADDR_RTN : VFLAT_Real_gfx12; } @@ -2879,28 +2987,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12 op, } // ENC_VFLAT. -defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">; -defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">; -defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">; -defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">; -defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">; -defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">; -defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">; -defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">; -defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">; -defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">; -defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">; -defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">; -defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">; -defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">; -defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">; -defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">; -defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">; -defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">; -defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">; -defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">; -defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">; -defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">; +defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">; +defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">; +defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">; +defm FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, 
"flat_load_i16">; +defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">; +defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">; +defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">; +defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">; +defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">; +defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">; +defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">; +defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">; +defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">; +defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">; +defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">; +defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">; +defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">; +defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">; +defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">; +defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">; +defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">; +defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">; defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">; defm FLAT_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">; defm FLAT_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">; @@ -2936,74 +3044,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; // ENC_VGLOBAL. 
-defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">; -defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">; -defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">; -defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">; -defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">; -defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">; -defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">; -defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">; -defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">; -defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">; -defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">; -defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">; -defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">; -defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">; -defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; -defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; -defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; -defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; -defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; -defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; -defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; -defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; -defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; -defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; -defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>; -defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>; - -defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; -defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; -defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; -defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; -defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; -defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; -defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; -defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; -defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; -defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; -defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; -defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; -defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; -defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; -defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; -defm 
GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; -defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; -defm GLOBAL_ATOMIC_SUB_X2 : VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; -defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; -defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; -defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; -defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; -defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; -defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; -defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; -defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; -defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; -defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>; -defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; -defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; -defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>; +defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">; +defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">; +defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">; +defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">; +defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">; +defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">; +defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">; +defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">; +defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">; +defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "global_store_b16">; +defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">; +defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">; +defm GLOBAL_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">; +defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">; +defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; +defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; +defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; +defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; +defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; +defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; +defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; +defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; +defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>; +defm GLOBAL_STORE_BLOCK : 
VFLAT_Real_AllAddr_gfx12<0x054>; + +defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; +defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; +defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; +defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; +defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; +defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; +defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; +defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; +defm GLOBAL_ATOMIC_AND : VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; +defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; +defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; +defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; +defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; +defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; +defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; +defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; +defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; +defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; +defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; +defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; +defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; +defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; +defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; +defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; +defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; +defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>; +defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>; defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>; defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>; -defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>; -defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>; +defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>; +defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>; -defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>; -defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>; +defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : 
VFLAT_Real_Atomics_gfx12<0x05a>; defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>; defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 0976fccf78d86..bbed828b4fed3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1189,6 +1189,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { } fixVALUPartialForwardingHazard(MI); fixVALUTransUseHazard(MI); + fixVALUTransCoexecutionHazards(MI); fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); @@ -1809,6 +1810,51 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { return true; } +bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) { + if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled. + !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) { + if (!SIInstrInfo::isTRANS(I)) + return false; + + // RAW: Trans(I) writes, VALU(MI) reads. + Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + for (const MachineOperand &ValuUse : MI->explicit_uses()) { + if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg())) + return true; + } + + auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst); + if (!ValuDst || !ValuDst->isReg()) + return false; + + // WAR: Trans(I) reads, VALU(MI) writes. + Register ValuDef = ValuDst->getReg(); + for (const MachineOperand &TransUse : I.explicit_uses()) { + if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg())) + return true; + } + + return false; + }; + + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I); + }; + + const int HasVALU = std::numeric_limits<int>::max(); + if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + return true; +} + bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index bbc55851bf967..ef6ddd874f58a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -104,6 +104,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { bool fixLdsDirectVMEMHazard(MachineInstr *MI); bool fixVALUPartialForwardingHazard(MachineInstr *MI); bool fixVALUTransUseHazard(MachineInstr *MI); + bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36d45969..a6553083d722b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); - RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, 
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index bbc55851bf967..ef6ddd874f58a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -104,6 +104,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { bool fixLdsDirectVMEMHazard(MachineInstr *MI); bool fixVALUPartialForwardingHazard(MachineInstr *MI); bool fixVALUTransUseHazard(MachineInstr *MI); + bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36d45969..a6553083d722b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); - RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, + &LiveIns[RegionIdx]); return RPTracker.moveMaxPressure(); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 0ad4778875cd3..268162bcada47 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -214,6 +214,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool FlatInstOffsets = false; bool FlatGlobalInsts = false; bool FlatScratchInsts = false; + bool FlatGVSMode = false; bool ScalarFlatScratchInsts = false; bool HasArchitectedFlatScratch = false; bool EnableFlatScratch = false; @@ -233,6 +234,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasRestrictedSOffset = false; bool Has64BitLiterals = false; bool HasBitOp3Insts = false; + bool HasTanhInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1160,6 +1162,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + bool hasFlatGVSMode() const { return FlatGVSMode; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -1377,6 +1381,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return HasMinimum3Maximum3F16; } + bool hasTanhInsts() const { return HasTanhInsts; } + + bool hasAddPC64Inst() const { return GFX1250Insts; } + bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index e7d0e1838fa63..2a920f6feb1c9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -108,7 +108,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext *Ctx) { int64_t SignedValue = static_cast<int64_t>(Value); - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case AMDGPU::fixup_si_sopp_br: { int64_t BrImm = (SignedValue - 4) / 4; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 22ae5f4e71915..0d5a8be6220db 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -64,6 +64,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup, return ELF::R_AMDGPU_ABS32_LO; case AMDGPUMCExpr::S_ABS32_HI: return ELF::R_AMDGPU_ABS32_HI; + case AMDGPUMCExpr::S_ABS64: + return ELF::R_AMDGPU_ABS64; } MCFixupKind Kind = Fixup.getKind(); @@ -76,7 +78,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup, return IsPCRel ?
ELF::R_AMDGPU_REL64 : ELF::R_AMDGPU_ABS64; } - if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) { + if (Fixup.getKind() == AMDGPU::fixup_si_sopp_br) { const auto *SymA = Target.getAddSym(); assert(SymA); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index cb6319ed627ca..ec9248b972ec4 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1332,6 +1332,16 @@ void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo, O << " index_key:" << Imm; } +void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index fb803b1f81342..e3299a618e882 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -132,6 +132,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printIndexKey16bit(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey32bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 31dd373e54fb6..ffdac8b8ce324 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -25,6 +25,7 @@ const MCAsmInfo::AtSpecifier atSpecifiers[] = { {AMDGPUMCExpr::S_REL64, "rel64"}, {AMDGPUMCExpr::S_ABS32_LO, "abs32@lo"}, {AMDGPUMCExpr::S_ABS32_HI, "abs32@hi"}, + {AMDGPUMCExpr::S_ABS64, "abs64"}, }; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 4bb3942936f04..f48739fe01814 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -381,9 +381,11 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions. // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel. - if ((Desc.TSFlags & SIInstrFlags::VOP3P) || - Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || - Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) { + if (((Desc.TSFlags & SIInstrFlags::VOP3P) || + Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || + Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) && + // Matrix B reuse operand reuses op_sel_hi. 
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) { Encoding |= getImplicitOpSelHiEncoding(Opcode); } @@ -562,7 +564,8 @@ static bool needsPCRel(const MCExpr *Expr) { case MCExpr::SymbolRef: { auto *SE = cast<MCSymbolRefExpr>(Expr); auto Spec = AMDGPU::getSpecifier(SE); - return Spec != AMDGPUMCExpr::S_ABS32_LO && Spec != AMDGPUMCExpr::S_ABS32_HI; + return Spec != AMDGPUMCExpr::S_ABS32_LO && + Spec != AMDGPUMCExpr::S_ABS32_HI && Spec != AMDGPUMCExpr::S_ABS64; } case MCExpr::Binary: { auto *BE = cast<MCBinaryExpr>(Expr); @@ -685,7 +688,12 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon( const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); uint32_t Offset = Desc.getSize(); assert(Offset == 4 || Offset == 8); - addFixup(Fixups, Offset, MO.getExpr(), FK_Data_4, PCRel); + auto OpType = Desc.operands()[OpNo].OperandType; + MCFixupKind Kind = (STI.hasFeature(AMDGPU::Feature64BitLiterals) && + OpType == AMDGPU::OPERAND_REG_IMM_INT64) + ? FK_Data_8 + : FK_Data_4; + addFixup(Fixups, Offset, MO.getExpr(), Kind, PCRel); } const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index e1b9720cdbfc5..bc6fdf7f2e4cd 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -50,6 +50,7 @@ class AMDGPUMCExpr : public MCTargetExpr { S_REL64, // symbol@rel64 S_ABS32_LO, // symbol@abs32@lo S_ABS32_HI, // symbol@abs32@hi + S_ABS64, // symbol@abs64 }; private: diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 9b5a46395695d..f018f77bc83e1 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, default: return false; case AMDGPU::V_MOV_B32_e32: + case AMDGPU::AV_MOV_B32_IMM_PSEUDO: SMovOp = AMDGPU::S_MOV_B32; break; case AMDGPU::V_MOV_B64_PSEUDO: @@ -946,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { // Copies and REG_SEQUENCE do not contribute to the final assembly // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
- if (Inst->isCopy() || Inst->isRegSequence()) { - if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { - if (!Inst->isCopy() || - !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { - Info.NumSVCopies++; - continue; - } + if (Inst->isRegSequence() && + TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { + Info.NumSVCopies++; + continue; + } + if (Inst->isCopy()) { + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI); + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) && + !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { + Info.NumSVCopies++; + continue; } } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0ed06c37507af..e172c0b63189b 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); + SetVector<MachineInstr *> ConstantFoldCandidates; for (FoldCandidate &Fold : FoldList) { assert(!Fold.isReg() || Fold.Def.OpToFold); if (Fold.isReg() && Fold.getReg().isVirtual()) { @@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, << static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI); - if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) { - LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI); - Changed = true; - } + if (Fold.isImm()) + ConstantFoldCandidates.insert(Fold.UseMI); } else if (Fold.Commuted) { // Restoring instruction's original operand order if fold has failed. TII->commuteInstruction(*Fold.UseMI, false); } } + + for (MachineInstr *MI : ConstantFoldCandidates) { + if (tryConstantFoldOp(MI)) { + LLVM_DEBUG(dbgs() << "Constant folded " << *MI); + Changed = true; + } + } return true; }
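The SIFoldOperands change above defers tryConstantFoldOp: instead of folding in the middle of the FoldList walk, affected instructions are collected in a SetVector and each is constant-folded exactly once afterwards. A hedged sketch of that shape (std::set stands in for llvm::SetVector; all names here are illustrative, not the pass's API):

    #include <set>
    #include <utility>
    #include <vector>
    struct Instr {};
    static bool tryConstantFold(Instr *) { return true; } // stand-in
    static void applyFolds(const std::vector<std::pair<Instr *, long>> &Folds) {
      std::set<Instr *> Candidates; // one instruction may receive several folds
      for (const auto &F : Folds)
        Candidates.insert(F.first);
      for (Instr *I : Candidates)
        tryConstantFold(I); // runs once per instruction, after all folds landed
    }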
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 93fd767943679..0c76ff2ec5ea7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -938,6 +940,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasBF16TransInsts()) { + setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal); + } + if (Subtarget->hasCvtPkF16F32Inst()) { setOperationAction(ISD::FP_ROUND, {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16}, @@ -8162,6 +8168,14 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, // which is a 64-bit pc-relative offset from the encoding of the $symbol // operand to the global variable. + if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) { + assert(GAFlags != SIInstrInfo::MO_NONE); + + SDValue Ptr = + DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2); + return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr); + } + SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags); SDValue PtrHi; if (GAFlags == SIInstrInfo::MO_NONE) @@ -8211,6 +8225,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, } if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { + if (Subtarget->has64BitLiterals()) { + SDValue Addr = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr), + 0); + } + SDValue AddrLo = DAG.getTargetGlobalAddress( GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; @@ -9289,7 +9310,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { - Module *M = const_cast<Module *>(MF.getFunction().getParent()); + Module *M = MF.getFunction().getParent(); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); auto *RelocSymbol = cast<GlobalVariable>( @@ -9315,6 +9336,44 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), IndexKeyi32); } + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i64) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi64, Op.getOperand(5), + Op.getOperand(6)}); + } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: { + EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ?
MVT::i64 + : MVT::i32; + if (Op.getOperand(6).getValueType() == IndexKeyTy) + return SDValue(); + + SDLoc SL(Op); + auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKey, Op.getOperand(7), + Op.getOperand(8)}); // No clamp operand + } case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { @@ -11074,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { assert(VT.getSizeInBits() == 64); SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); + SDValue Cond = DAG.getFreeze(Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); @@ -12155,6 +12214,11 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( if ((bitOpWithConstantIsReducible(Opc, ValLo) || bitOpWithConstantIsReducible(Opc, ValHi)) || (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { + // We have 64-bit scalar and/or/xor, but do not have vector forms. + if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() && + !CRHS->user_begin()->isDivergent()) + return SDValue(); + // If we need to materialize a 64-bit immediate, it will be split up later // anyway. Avoid creating the harder to understand 64-bit immediate // materialization. @@ -14499,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14532,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14644,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -15119,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, - // y is not, and (add y, z) is used only once. - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, - // z is not, and (add y, z) is used only once. - // The goal is to move constant offsets to the outermost ptradd, to create - // more opportunities to fold offsets into memory instructions. 
- // Together with the generic combines in DAGCombiner.cpp, this also - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). - // - // This transform is here instead of in the general DAGCombiner as it can - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for - // AArch64's CPA. - SDValue X = N0; - SDValue Y = N1.getOperand(0); - SDValue Z = N1.getOperand(1); - if (N1.hasOneUse()) { - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); - if (ZIsConstant != YIsConstant) { - // If both additions in the original were NUW, the new ones are as well. - SDNodeFlags Flags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; - if (YIsConstant) - std::swap(Y, Z); + // The following folds transform PTRADDs into regular arithmetic in cases + // where the PTRADD wouldn't be folded as an immediate offset into memory + // instructions anyway. They are target-specific in that other targets might + // prefer to not lose information about the pointer arithmetic. + + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). + // Adapted from DAGCombiner::visitADDLikeCommutative. + SDValue V, K; + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { + SDNodeFlags ShlFlags = N1->getFlags(); + // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0, + // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be + // preserved. + SDNodeFlags NewShlFlags = + ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap() + ? SDNodeFlags::NoSignedWrap + : SDNodeFlags(); + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); + } + + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in + // performAddCombine. + if (N1.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; + } + } + + // If the 32 low bits of the constant are all zero, there is nothing to fold + // into an immediate offset, so it's better to eliminate the unnecessary + // addition for the lower 32 bits than to preserve the PTRADD. + // Analogous to a fold in performAddCombine. + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c), v) with + // global address GA and constant c, such that c can be folded into GA. + SDValue GAValue = N0.getOperand(0); + if (const GlobalAddressSDNode *GA = + dyn_cast<GlobalAddressSDNode>(GAValue)) { + if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) { + // If both additions in the original were NUW, reassociation preserves + // that. + SDNodeFlags Flags = + (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); DCI.AddToWorklist(Inner.getNode()); - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); } } }
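The reassociations in this hunk move constant offsets toward the outermost ptradd (see the comments that follow) so they can later fold into a memory instruction's immediate offset field. A hypothetical before/after, assuming such an offset field exists on the target:

    #include <cstdint>
    // before: addr = base + (idx + 16)  -- the +16 hides inside the inner add
    static uint64_t addrBefore(uint64_t Base, uint64_t Idx) {
      return Base + (Idx + 16);
    }
    // after:  addr = (base + idx) + 16 -- same value, but the +16 is now a
    // direct ptradd operand and can become the instruction's offset.
    static uint64_t addrAfter(uint64_t Base, uint64_t Idx) {
      return (Base + Idx) + 16;
    }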
+ if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) + return SDValue(); + + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, + // y is not, and (add y, z) is used only once. + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, + // z is not, and (add y, z) is used only once. + // The goal is to move constant offsets to the outermost ptradd, to create + // more opportunities to fold offsets into memory instructions. + // Together with the generic combines in DAGCombiner.cpp, this also + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y). + // + // This transform is here instead of in the general DAGCombiner as it can + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for + // AArch64's CPA. + SDValue X = N0; + SDValue Y = N1.getOperand(0); + SDValue Z = N1.getOperand(1); + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + + // If both additions in the original were NUW, reassociation preserves that. + SDNodeFlags ReassocFlags = + (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + + if (ZIsConstant != YIsConstant) { + if (YIsConstant) + std::swap(Y, Z); + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); + } + + // If one of Y and Z is constant, they have been handled above. If both were + // constant, the addition would have been folded in SelectionDAG::getNode + // already. This ensures that the generic DAG combines won't undo the + // following reassociation. + assert(!YIsConstant && !ZIsConstant); + + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and + // y are uniform and z isn't. + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and + // z are uniform and y isn't. + // The goal is to push uniform operands up in the computation, so that they + // can be handled with scalar operations. We can't use reassociateScalarOps + // for this since it requires two identical commutative operations to + // reassociate. + if (Y->isDivergent()) + std::swap(Y, Z); + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(UniformInner.getNode()); + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); + } + return SDValue(); } @@ -16848,12 +16996,63 @@ static void knownBitsForWorkitemID(const GCNSubtarget &ST, Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); } +static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, + KnownBits &Known, const APInt &DemandedElts, + unsigned BFEWidth, bool SExt, unsigned Depth) { + const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo(); + const MachineOperand &Src1 = MI.getOperand(2); + + unsigned Src1Cst = 0; + if (Src1.isImm()) { + Src1Cst = Src1.getImm(); + } else if (Src1.isReg()) { + auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI); + if (!Cst) + return; + Src1Cst = Cst->Value.getZExtValue(); + } else { + return; + } + + // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit. + // Width is always [22:16]. + const unsigned Offset = + Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6); + const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6); + + if (Width >= BFEWidth) // Ill-formed. + return; + + VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts, + Depth + 1); + + Known = Known.extractBits(Width, Offset); + + if (SExt) + Known = Known.sext(BFEWidth); + else + Known = Known.zext(BFEWidth); +} +
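knownBitsForSBFE above decodes the S_BFE source-1 immediate as an offset in the low bits (five bits for the 32-bit forms, six for 64-bit) and a width in bits [22:16]. The same decoding in isolation, as a hedged standalone sketch:

    #include <cstdint>
    struct BFEField { unsigned Offset, Width; };
    static BFEField decodeSBFEImm(uint32_t Src1Cst, unsigned BFEWidth) {
      unsigned OffsetBits = (BFEWidth == 32) ? 5 : 6; // [4:0] or [5:0]
      BFEField F;
      F.Offset = Src1Cst & ((1u << OffsetBits) - 1);
      F.Width = (Src1Cst >> 16) & 0x3f;               // bits [22:16]
      return F;
    }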
void SITargetLowering::computeKnownBitsForTargetInstr( GISelValueTracking &VT, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth) const { + Known.resetAll(); const MachineInstr *MI = MRI.getVRegDef(R); switch (MI->getOpcode()) { + case AMDGPU::S_BFE_I32: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32, + /*SExt=*/true, Depth); + case AMDGPU::S_BFE_U32: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32, + /*SExt=*/false, Depth); + case AMDGPU::S_BFE_I64: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64, + /*SExt=*/true, Depth); + case AMDGPU::S_BFE_U64: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64, + /*SExt=*/false, Depth); case AMDGPU::G_INTRINSIC: case AMDGPU::G_INTRINSIC_CONVERGENT: { Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID(); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359f03da6..2af0a575a8885 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { llvm_unreachable("event type has no associated counter"); } -// This objects maintains the current score brackets of each wait counter, and -// a per-register scoreboard for each wait counter. -// -// We also maintain the latest score for every event type that can change the -// waitcnt in order to know if there are multiple types of events within -// the brackets. When multiple types of event happen in the bracket, -// wait count may get decreased out of order, therefore we need to put in -// "s_waitcnt 0" before use. -class WaitcntBrackets { -public: - WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, - HardwareLimits Limits, const unsigned *WaitEventMaskForInst, - InstCounterType SmemAccessCounter) - : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), - WaitEventMaskForInst(WaitEventMaskForInst), - SmemAccessCounter(SmemAccessCounter) {} - - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } - - bool isSmemCounter(InstCounterType T) const { - return T == SmemAccessCounter || T == X_CNT; - } - - unsigned getSgprScoresIdx(InstCounterType T) const { - assert(isSmemCounter(T) && "Invalid SMEM counter"); - return T == X_CNT ?
1 : 0; - } - - unsigned getScoreLB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreLBs[T]; - } - - unsigned getScoreUB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreUBs[T]; - } - - unsigned getScoreRange(InstCounterType T) const { - return getScoreUB(T) - getScoreLB(T); - } - - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; - } - - bool merge(const WaitcntBrackets &Other); - - RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - const MachineOperand &Op) const; - - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } - - void applyWaitcnt(const AMDGPU::Waitcnt &Wait); - void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); - - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); - } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; - assert((HasPending != 0) == (getScoreRange(T) != 0)); - return HasPending; - } - - bool hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); - // Return true if more than one bit is set in Events. - return Events & (Events - 1); - } - - bool hasPendingFlat() const { - return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && - LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || - (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && - LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); - } - - void setPendingFlat() { - LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; - LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; - } - - bool hasPendingGDS() const { - return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; - } - - unsigned getPendingGDSWait() const { - return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); - } - - void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } - - // Return true if there might be pending writes to the vgpr-interval by VMEM - // instructions with types different from V. 
- bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) - return true; - } - return false; - } - - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; - } - } - - void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); - PendingEvents |= WaitEventMaskForInst[STORE_CNT]; - } - - ArrayRef<MachineInstr *> getLDSDMAStores() const { - return LDSDMAStores; - } - - bool hasPointSampleAccel(const MachineInstr &MI) const; - bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; - - void print(raw_ostream &) const; - void dump() const { print(dbgs()); } - -private: - struct MergeInfo { - unsigned OldLB; - unsigned OtherLB; - unsigned MyShift; - unsigned OtherShift; - }; - static bool mergeScore(const MergeInfo &M, unsigned &Score, - unsigned OtherScore); - - void setScoreLB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreUBs[T] = Val; - - if (T != EXP_CNT) - return; - - if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); - } - - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); - } - - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); - - const GCNSubtarget *ST = nullptr; - InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; - HardwareLimits Limits = {}; - const unsigned *WaitEventMaskForInst; - InstCounterType SmemAccessCounter; - unsigned ScoreLBs[NUM_INST_CNTS] = {0}; - unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; - // Remember the last flat memory operation. - unsigned LastFlat[NUM_INST_CNTS] = {0}; - // Remember the last GDS operation. - unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; - // Store representative LDS DMA operations. The only useful info here is - // alias info. One store is kept per unique AAInfo. - SmallVector<MachineInstr *, 1> LDSDMAStores; -}; +class WaitcntBrackets;
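The forward declaration above supports the restructuring in the following hunks: rather than every WaitcntBrackets instance copying the subtarget, limits, and event masks, the brackets keep a single back-pointer to the owning SIInsertWaitcnts. A hedged sketch of that ownership shape (names here are illustrative, not the pass's types):

    // Shared, read-only configuration lives with the pass; per-block state
    // objects carry only a pointer to it.
    struct PassContext {
      unsigned WaitCountMax[8] = {}; // set up once per function
    };
    class BracketsSketch {
    public:
      explicit BracketsSketch(const PassContext *Ctx) : Ctx(Ctx) {}
      unsigned maxFor(unsigned T) const { return Ctx->WaitCountMax[T]; }
    private:
      const PassContext *Ctx; // replaces several copied fields per instance
    };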
// This abstracts the logic for generating and updating S_WAIT* instructions // away from the analysis that determines where they are needed. This was @@ -640,8 +407,13 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { }; class SIInsertWaitcnts { +public: + const GCNSubtarget *ST; + InstCounterType SmemAccessCounter; + InstCounterType MaxCounter; + const unsigned *WaitEventMaskForInst; + private: - const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -657,8 +429,6 @@ class SIInsertWaitcnts { bool Dirty = true; }; - InstCounterType SmemAccessCounter; - MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; bool ForceEmitWaitcnt[NUM_INST_CNTS]; @@ -675,7 +445,7 @@ class SIInsertWaitcnts { // message. DenseSet<MachineInstr *> ReleaseVGPRInsts; - InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; + HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -686,6 +456,30 @@ class SIInsertWaitcnts { (void)ForceVMCounter; } + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + default: + break; + } + return 0; + } + bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets); @@ -791,6 +585,211 @@ class SIInsertWaitcnts { WaitcntBrackets &ScoreBrackets); }; +// This object maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. +// +// We also maintain the latest score for every event type that can change the +// waitcnt in order to know if there are multiple types of events within +// the brackets. When multiple types of event happen in the bracket, +// wait count may get decreased out of order, therefore we need to put in +// "s_waitcnt 0" before use. +class WaitcntBrackets { +public: + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + + bool isSmemCounter(InstCounterType T) const { + return T == Context->SmemAccessCounter || T == X_CNT; + } + + unsigned getSgprScoresIdx(InstCounterType T) const { + assert(isSmemCounter(T) && "Invalid SMEM counter"); + return T == X_CNT ?
1 : 0; + } + + unsigned getScoreLB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreLBs[T]; + } + + unsigned getScoreUB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreUBs[T]; + } + + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + + unsigned getRegScore(int GprNo, InstCounterType T) const { + if (GprNo < NUM_ALL_VGPRS) + return VgprScores[T][GprNo]; + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + } + + bool merge(const WaitcntBrackets &Other); + + RegInterval getRegInterval(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + const MachineOperand &Op) const; + + bool counterOutOfOrder(InstCounterType T) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + + void determineWait(InstCounterType T, RegInterval Interval, + AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, + AMDGPU::Waitcnt &Wait) const { + determineWait(T, {RegNo, RegNo + 1}, Wait); + } + + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); + void applyXcnt(const AMDGPU::Waitcnt &Wait); + void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + unsigned hasPendingEvent() const { return PendingEvents; } + unsigned hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); + } + unsigned hasPendingEvent(InstCounterType T) const { + unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; + assert((HasPending != 0) == (getScoreRange(T) != 0)); + return HasPending; + } + + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = hasPendingEvent(T); + // Return true if more than one bit is set in Events. + return Events & (Events - 1); + } + + bool hasPendingFlat() const { + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); + } + + void setPendingFlat() { + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; + } + + bool hasPendingGDS() const { + return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; + } + + unsigned getPendingGDSWait() const { + return std::min(getScoreUB(DS_CNT) - LastGDS, + Context->getWaitCountMax(DS_CNT) - 1); + } + + void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } + + // Return true if there might be pending writes to the vgpr-interval by VMEM + // instructions with types different from V. 
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + if (VgprVmemTypes[RegNo] & ~(1 << V)) + return true; + } + return false; + } + + void clearVgprVmemTypes(RegInterval Interval) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + VgprVmemTypes[RegNo] = 0; + } + } + + void setStateOnFunctionEntryOrReturn() { + setScoreUB(STORE_CNT, + getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); + PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; + } + + ArrayRef<MachineInstr *> getLDSDMAStores() const { + return LDSDMAStores; + } + + bool hasPointSampleAccel(const MachineInstr &MI) const; + bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, + RegInterval Interval) const; + + void print(raw_ostream &) const; + void dump() const { print(dbgs()); } + +private: + struct MergeInfo { + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; + }; + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); + + void setScoreLB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreLBs[T] = Val; + } + + void setScoreUB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreUBs[T] = Val; + + if (T != EXP_CNT) + return; + + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + } + + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { + setScoreByInterval({GprNo, GprNo + 1}, T, Val); + } + + void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, + unsigned Score); + + void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); + + const SIInsertWaitcnts *Context; + + unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; + // Remember the last flat memory operation. + unsigned LastFlat[NUM_INST_CNTS] = {0}; + // Remember the last GDS operation. + unsigned LastGDS = 0; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the + // X_CNT score. + unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is + // alias info. One store is kept per unique AAInfo. + SmallVector<MachineInstr *, 1> LDSDMAStores; +}; +
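hasMixedPendingEvents in the class above uses the classic clear-lowest-set-bit identity: x & (x - 1) is nonzero exactly when x has more than one bit set. In isolation:

    static bool moreThanOneBitSet(unsigned Events) {
      return (Events & (Events - 1)) != 0;
    }
    static_assert((4u & (4u - 1)) == 0, "single event type -> no mixed events");
    static_assert((5u & (5u - 1)) != 0, "two event types -> mixed events");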
class SIInsertWaitcntsLegacy : public MachineFunctionPass { public: static char ID; @@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); + MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); assert(isUInt<8>(RegIdx)); @@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, // this at compile time, so we have to assume it might be applied if the // instruction supports it). bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { - if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) + if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) return false; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); @@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(WaitEventMaskForInst, E); + InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = getScoreUB(T); unsigned CurrScore = UB + 1; @@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } void WaitcntBrackets::print(raw_ostream &OS) const { + const GCNSubtarget *ST = Context->ST; + OS << '\n'; - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); switch (T) { @@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // s_waitcnt instruction. if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !ST->hasFlatLgkmVMemCountInOrder()) { + !Context->ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. @@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. unsigned NeededWait = - std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~WaitEventMaskForInst[T]; + PendingEvents &= ~Context->WaitEventMaskForInst[T]; } }
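determineWait above clamps the requested wait once a counter's score range saturates: it never asks for a wait equal to the counter's hardware maximum, which would not be representable. The clamp in isolation:

    #include <algorithm>
    // UB is the current upper-bound score, ScoreToWait the producer's score.
    static unsigned neededWait(unsigned UB, unsigned ScoreToWait,
                               unsigned CounterMax) {
      return std::min(UB - ScoreToWait, CounterMax - 1);
    }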
@@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. - if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || + if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; return hasMixedPendingEvents(T); @@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter + const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + WaitEventMaskForInst = WCG->getWaitEventMask(); SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - HardwareLimits Limits = {}; if (ST->hasExtendedWaitCounts()) { Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); Limits.DscntMax = AMDGPU::getDscntBitMask(IV); @@ -2807,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } - auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this); NonKernelInitialState->setStateOnFunctionEntryOrReturn(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) { - Brackets = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + Brackets = std::make_unique<WaitcntBrackets>(this); } else { // Reinitialize in-place. N.B. do not do this by assigning from a // temporary because the WaitcntBrackets class is large and it could // cause this function to use an unreasonable amount of stack space.
Brackets->~WaitcntBrackets(); - new (Brackets.get()) WaitcntBrackets( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + new (Brackets.get()) WaitcntBrackets(this); } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 4c5f938831243..c8935f0cb6034 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -687,7 +687,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, if (!SafeToPropagate) break; - DefOp.setIsKill(false); + for (auto I = Def; I != MI; ++I) + I->clearRegisterKills(DefOp.getReg(), &RI); } MachineInstrBuilder Builder = @@ -1625,41 +1626,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { } } -static unsigned getAGPRSpillSaveOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_SAVE; - case 8: - return AMDGPU::SI_SPILL_A64_SAVE; - case 12: - return AMDGPU::SI_SPILL_A96_SAVE; - case 16: - return AMDGPU::SI_SPILL_A128_SAVE; - case 20: - return AMDGPU::SI_SPILL_A160_SAVE; - case 24: - return AMDGPU::SI_SPILL_A192_SAVE; - case 28: - return AMDGPU::SI_SPILL_A224_SAVE; - case 32: - return AMDGPU::SI_SPILL_A256_SAVE; - case 36: - return AMDGPU::SI_SPILL_A288_SAVE; - case 40: - return AMDGPU::SI_SPILL_A320_SAVE; - case 44: - return AMDGPU::SI_SPILL_A352_SAVE; - case 48: - return AMDGPU::SI_SPILL_A384_SAVE; - case 64: - return AMDGPU::SI_SPILL_A512_SAVE; - case 128: - return AMDGPU::SI_SPILL_A1024_SAVE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: @@ -1707,22 +1673,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size, return AMDGPU::SI_SPILL_WWM_V32_SAVE; } -static unsigned getVectorRegSpillSaveOpcode(Register Reg, - const TargetRegisterClass *RC, - unsigned Size, - const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if spilling a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillSaveOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) - : getVGPRSpillSaveOpcode(Size); + return getVGPRSpillSaveOpcode(Size); } void SIInstrInfo::storeRegToStackSlot( @@ -1770,8 +1734,8 @@ void SIInstrInfo::storeRegToStackSlot( return; } - unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, - SpillSize, RI, *MFI); + unsigned Opcode = + getVectorRegSpillSaveOpcode(VReg ? 
VReg : SrcReg, RC, SpillSize, *MFI); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1854,41 +1818,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { } } -static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_RESTORE; - case 8: - return AMDGPU::SI_SPILL_A64_RESTORE; - case 12: - return AMDGPU::SI_SPILL_A96_RESTORE; - case 16: - return AMDGPU::SI_SPILL_A128_RESTORE; - case 20: - return AMDGPU::SI_SPILL_A160_RESTORE; - case 24: - return AMDGPU::SI_SPILL_A192_RESTORE; - case 28: - return AMDGPU::SI_SPILL_A224_RESTORE; - case 32: - return AMDGPU::SI_SPILL_A256_RESTORE; - case 36: - return AMDGPU::SI_SPILL_A288_RESTORE; - case 40: - return AMDGPU::SI_SPILL_A320_RESTORE; - case 44: - return AMDGPU::SI_SPILL_A352_RESTORE; - case 48: - return AMDGPU::SI_SPILL_A384_RESTORE; - case 64: - return AMDGPU::SI_SPILL_A512_RESTORE; - case 128: - return AMDGPU::SI_SPILL_A1024_RESTORE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillRestoreOpcode(unsigned Size) { switch (Size) { case 4: @@ -1930,27 +1859,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, if (Size != 4) llvm_unreachable("unknown wwm register spill size"); - if (IsVectorSuperClass) + if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs return AMDGPU::SI_SPILL_WWM_AV32_RESTORE; return AMDGPU::SI_SPILL_WWM_V32_RESTORE; } -static unsigned -getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, - unsigned Size, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if restoring a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillRestoreOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) - : getVGPRSpillRestoreOpcode(Size); + assert(!RI.isAGPRClass(RC)); + return getVGPRSpillRestoreOpcode(Size); } void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, @@ -1998,7 +1927,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, } unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, - SpillSize, RI, *MFI); + SpillSize, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset @@ -2273,6 +2202,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::S_MOV_B64_IMM_PSEUDO: { const MachineOperand &SrcOp = MI.getOperand(1); assert(!SrcOp.isFPImm()); + + if (ST.has64BitLiterals()) { + MI.setDesc(get(AMDGPU::S_MOV_B64)); + break; + } + APInt Imm(64, SrcOp.getImm()); if (Imm.isIntN(32) || isInlineConstant(Imm)) { MI.setDesc(get(AMDGPU::S_MOV_B64)); @@ -2492,6 +2427,25 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::SI_PC_ADD_REL_OFFSET64: { + MachineFunction &MF = *MBB.getParent(); + Register Reg = MI.getOperand(0).getReg(); + MachineOperand Op = MI.getOperand(1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. 
+ MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + if (Op.isGlobal()) + Op.setOffset(Op.getOffset() + 4); + Bundler.append( + BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op)); + + finalizeBundle(MBB, Bundler.begin()); + + MI.eraseFromParent(); + break; + } case AMDGPU::ENTER_STRICT_WWM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // Whole Wave Mode is entered. @@ -2807,12 +2761,14 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, if ((int)OpIdx1 != Src0Idx && MO0->isReg()) { if (!DefinedRC1) return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx1, *MO0); + return isLegalRegOperand(MI, OpIdx1, *MO0) && + (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1)); } if ((int)OpIdx0 != Src0Idx && MO1->isReg()) { if (!DefinedRC0) return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx0, *MO1); + return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) && + isLegalRegOperand(MI, OpIdx0, *MO1); } // No need to check 64-bit literals since swapping does not bring new @@ -2903,9 +2859,9 @@ bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc, bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, int64_t BrOffset) const { - // BranchRelaxation should never have to check s_setpc_b64 because its dest - // block is unanalyzable. - assert(BranchOp != AMDGPU::S_SETPC_B64); + // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64 + // because its dest block is unanalyzable. + assert(isSOPP(BranchOp) || isSOPK(BranchOp)); // Convert to dwords. BrOffset /= 4; @@ -2946,13 +2902,30 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + auto I = MBB.end(); + auto &MCCtx = MF->getContext(); + + if (ST.hasAddPC64Inst()) { + MCSymbol *Offset = + MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true); + auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64)) + .addSym(Offset, MO_FAR_BRANCH_OFFSET); + MCSymbol *PostAddPCLabel = + MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true); + AddPC->setPostInstrSymbol(*MF, PostAddPCLabel); + auto *OffsetExpr = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx), + MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx); + Offset->setVariableValue(OffsetExpr); + return; + } + + assert(RS && "RegScavenger required for long branching"); // FIXME: Virtual register workaround for RegScavenger not working with empty // blocks. Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - auto I = MBB.end(); - // Note: as this is used after hazard recognizer we need to apply some hazard // workarounds directly. const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) || @@ -2968,7 +2941,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); ApplyHazardWorkarounds(); - auto &MCCtx = MF->getContext(); MCSymbol *PostGetPCLabel = MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); @@ -3507,6 +3479,10 @@ static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) { ?
AMDGPU::V_FMAAK_F16_t16 : AMDGPU::V_FMAAK_F16_fake16 : AMDGPU::V_FMAAK_F16; + case AMDGPU::V_FMAC_F64_e32: + case AMDGPU::V_FMAC_F64_e64: + case AMDGPU::V_FMA_F64_e64: + return AMDGPU::V_FMAAK_F64; default: llvm_unreachable("invalid instruction"); } @@ -3535,6 +3511,10 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { ? AMDGPU::V_FMAMK_F16_t16 : AMDGPU::V_FMAMK_F16_fake16 : AMDGPU::V_FMAMK_F16; + case AMDGPU::V_FMAC_F64_e32: + case AMDGPU::V_FMAC_F64_e64: + case AMDGPU::V_FMA_F64_e64: + return AMDGPU::V_FMAMK_F64; default: llvm_unreachable("invalid instruction"); } @@ -3613,7 +3593,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64) { + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 || + Opc == AMDGPU::V_FMAC_F64_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. if (hasAnyModifiersSet(UseMI)) @@ -3685,7 +3666,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || + Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3753,7 +3735,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || + Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -4074,8 +4057,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel); - if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && - !IsLegacy && + if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy && + (!IsF64 || ST.hasFmaakFmamkF64Insts()) && // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { @@ -6099,14 +6082,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; if (Is64BitOp && !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) { - if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp)) + if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) && + (!ST.has64BitLiterals() || InstDesc.getSize() != 4)) return false; // FIXME: We can use sign extended 64-bit literals, but only for signed // operands. At the moment we do not know if an operand is signed. // Such operand will be encoded as its low 32 bits and then either // correctly sign extended or incorrectly zero extended by HW. 
- if (!Is64BitFPOp && (int32_t)Imm < 0) + // If 64-bit literals are supported and the literal will be encoded + // as a full 64-bit value, we can still use it. + if (!Is64BitFPOp && (int32_t)Imm < 0 && + (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false))) return false; } } @@ -6402,7 +6389,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldSAddrIdx < 0) return false; - assert(isSegmentSpecificFLAT(Inst)); + assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode())); int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); if (NewOpc < 0) @@ -6426,7 +6413,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldVAddrIdx >= 0) { MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); - if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || + if (!VAddrDef || !VAddrDef->isMoveImmediate() || !VAddrDef->getOperand(1).isImm() || VAddrDef->getOperand(1).getImm() != 0) return false; @@ -6479,7 +6466,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { // FIXME: Remove this when SelectionDAG is obsoleted. void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const { - if (!isSegmentSpecificFLAT(MI)) + if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode()) return; // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence @@ -9178,15 +9165,30 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (isDPP(MI)) return DescSize; bool HasLiteral = false; + unsigned LiteralSize = 4; for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { const MachineOperand &Op = MI.getOperand(I); const MCOperandInfo &OpInfo = Desc.operands()[I]; if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) { HasLiteral = true; + if (ST.has64BitLiterals()) { + switch (OpInfo.OperandType) { + default: + break; + case AMDGPU::OPERAND_REG_IMM_FP64: + if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true)) + LiteralSize = 8; + break; + case AMDGPU::OPERAND_REG_IMM_INT64: + if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false)) + LiteralSize = 8; + break; + } + } break; } } - return HasLiteral ? DescSize + 4 : DescSize; + return HasLiteral ? DescSize + LiteralSize : DescSize; } // Check whether we have extra NSA words. 
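For reference, the 32-bit-literal test that both hunks above rely on, AMDGPU::isValid32BitLiteral, reduces to the following self-contained sketch (paraphrased for illustration, not code from this patch); it is why getInstSizeInBytes charges 8 extra bytes only when the test fails:

#include <cstdint>
// A 64-bit immediate fits the 32-bit literal slot if it is an FP64 value
// whose low dword is zero (hardware materializes only the high dword), or
// an integer that zero- or sign-extends from 32 bits.
static bool valid32BitLiteral(uint64_t Val, bool IsFP64) {
  if (IsFP64)
    return (Val & 0xffffffffu) == 0;
  return Val <= UINT64_C(0xffffffff) ||
         (int64_t)Val == (int64_t)(int32_t)Val;
}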
@@ -9277,13 +9279,16 @@ SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { ArrayRef<std::pair<unsigned, const char *>> SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { - { MO_GOTPCREL, "amdgpu-gotprel" }, - { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, - { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, - { MO_REL32_LO, "amdgpu-rel32-lo" }, - { MO_REL32_HI, "amdgpu-rel32-hi" }, - { MO_ABS32_LO, "amdgpu-abs32-lo" }, - { MO_ABS32_HI, "amdgpu-abs32-hi" }, + {MO_GOTPCREL, "amdgpu-gotprel"}, + {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"}, + {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"}, + {MO_GOTPCREL64, "amdgpu-gotprel64"}, + {MO_REL32_LO, "amdgpu-rel32-lo"}, + {MO_REL32_HI, "amdgpu-rel32-hi"}, + {MO_REL64, "amdgpu-rel64"}, + {MO_ABS32_LO, "amdgpu-abs32-lo"}, + {MO_ABS32_HI, "amdgpu-abs32-hi"}, + {MO_ABS64, "amdgpu-abs64"}, }; return ArrayRef(TargetFlags); @@ -10390,10 +10395,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { return TargetInstrInfo::isGlobalMemoryObject(MI); } +bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const { + if (!isWMMA(MI) && !isSWMMAC(MI)) + return false; + + if (AMDGPU::isGFX1250(ST)) + return AMDGPU::getWMMAIsXDL(MI.getOpcode()); + + return true; +} + bool SIInstrInfo::isXDL(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); - if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) || + if (AMDGPU::isGFX12Plus(ST)) + return isDOT(MI) || isXDLWMMA(MI); + + if (!isMAI(MI) || isDGEMM(Opcode) || Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 9e84822bfc273..5e92921f3ea21 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -33,6 +33,7 @@ class LiveVariables; class MachineDominatorTree; class MachineRegisterInfo; class RegScavenger; +class SIMachineFunctionInfo; class TargetRegisterClass; class ScheduleHazardRecognizer; @@ -214,16 +215,20 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { MO_GOTPCREL32_LO = 2, // MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI. MO_GOTPCREL32_HI = 3, + // MO_GOTPCREL64 -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL. + MO_GOTPCREL64 = 4, // MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO. - MO_REL32 = 4, - MO_REL32_LO = 4, + MO_REL32 = 5, + MO_REL32_LO = 5, // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI. 
- MO_REL32_HI = 5, + MO_REL32_HI = 6, + MO_REL64 = 7, - MO_FAR_BRANCH_OFFSET = 6, + MO_FAR_BRANCH_OFFSET = 8, - MO_ABS32_LO = 8, - MO_ABS32_HI = 9, + MO_ABS32_LO = 9, + MO_ABS32_HI = 10, + MO_ABS64 = 11, }; explicit SIInstrInfo(const GCNSubtarget &ST); @@ -283,6 +288,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override; + unsigned getVectorRegSpillSaveOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + unsigned + getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -863,6 +877,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } + bool isXDLWMMA(const MachineInstr &MI) const; + bool isXDL(const MachineInstr &MI) const; static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); } @@ -1097,7 +1113,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { // that will not require an additional 4-bytes; this function assumes that it // will. bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - assert(!MO.isReg() && "isInlineConstant called on register operand!"); if (!MO.isImm()) return false; return isInlineConstant(MO.getImm(), OperandType); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index e5ce0a9d0e971..ab7d34002e9f1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -268,6 +268,10 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; +def SIpc_add_rel_offset64 : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET64", + SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> +>; + def SIlds : SDNode<"AMDGPUISD::LDS", SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> >; @@ -1247,6 +1251,7 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def IndexKey32bit : CustomOperand; def IndexKey16bit : CustomOperand; def IndexKey8bit : CustomOperand; @@ -1302,6 +1307,9 @@ let PrintMethod = "printBitOp3" in def BitOp3 : NamedIntOperand<"bitop3">; def bitop3_0 : DefaultOperand; +def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; +def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; + class KImmFPOperand : ImmOperand { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_KIMM"#vt.Size; @@ -1633,6 +1641,8 @@ def VOP3PMods : ComplexPattern; def VOP3PModsDOT : ComplexPattern; def VOP3PModsNeg : ComplexPattern; +def VOP3PModsNegs : ComplexPattern; // TODO: Reconsider whether a complex pattern is needed here. 
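A hypothetical consumer of the new 64-bit relocation flags added above (the opcode and flag pairing here are illustrative assumptions, not taken from this patch):

// Tag a global-address operand with MO_REL64 so MC lowering emits a
// symbol@rel64 fixup; MIR prints/parses it as target-flags(amdgpu-rel64)
// per the serialization table above.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), DstReg)
    .addGlobalAddress(GV, /*Offset=*/0, SIInstrInfo::MO_REL64);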
+def VOP3PModsNegAbs : ComplexPattern; def WMMAOpSelVOP3PMods : ComplexPattern; def WMMAModsF32NegAbs : ComplexPattern; @@ -1641,6 +1651,7 @@ def WMMAModsF16NegAbs : ComplexPattern; def WMMAVISrc : ComplexPattern; def SWMMACIndex8 : ComplexPattern; def SWMMACIndex16 : ComplexPattern; +def SWMMACIndex32 : ComplexPattern; def VOP3OpSel : ComplexPattern; @@ -2654,6 +2665,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { isModifierType.ret, HasOMod); field bit HasNeg = HasModifiers; + field bit HasMatrixReuse = 0; field bit HasSrc0Mods = HasModifiers; field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4419ce00b473c..2a6fcadd4c49c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1144,6 +1144,14 @@ def : GCNPat < (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) >; +def SI_PC_ADD_REL_OFFSET64 : SPseudoInstSI < + (outs SReg_64:$dst), + (ins si_ga:$ptr), + [(set SReg_64:$dst, + (i64 (SIpc_add_rel_offset64 tglobaladdr:$ptr)))]> { + let SubtargetPredicate = Has64BitLiterals; +} + def : GCNPat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) @@ -2465,7 +2473,6 @@ def : AMDGPUPat < >; let True16Predicate = NotHasTrue16BitInsts in { -let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern ; def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), @@ -2475,35 +2482,6 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; -} // isNotGFX9Plus - -let SubtargetPredicate = isGFX9GFX10 in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - -foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), - (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in -def : GCNPat; - -def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src1, - /* src2_modifiers */ 0, - $src2, /* clamp */ 0, /* op_sel */ 0) ->; -} // isGFX9GFX10 } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { @@ -3104,8 +3082,6 @@ def : GCNPat < (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; -// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped -// to V_PERM_B32. 
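The V_ALIGNBIT rewrites above rest on a funnel-shift identity that is easy to state in plain C++ (a standalone sketch, independent of the patch):

#include <cstdint>
// v_alignbit_b32 d, hi, lo, s returns 32 bits starting at bit s of the
// 64-bit concatenation (hi:lo).
static uint32_t alignbit(uint32_t Hi, uint32_t Lo, unsigned S) {
  S &= 31;
  return S == 0 ? Lo : (Lo >> S) | (Hi << (32 - S));
}
// With Hi == Lo == X this is exactly a 32-bit rotate right, which is why
// ROTRPattern and the trunc(srl i64, s) patterns can all map to it.
static uint32_t rotr32(uint32_t X, unsigned S) { return alignbit(X, X, S); }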
let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (i32 (bswap i32:$a)), @@ -3581,20 +3557,15 @@ def : GCNPat < // Take the upper 16 bits from V[0] and the lower 16 bits from V[1] // Special case, can use V_ALIGNBIT (always uses encoded literal) -let True16Predicate = NotHasTrue16BitInsts in { -defvar BuildVectorToAlignBitPat = +let True16Predicate = NotHasTrue16BitInsts in +def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty !if(!eq(Ty, i16), (Ty (trunc (srl VGPR_32:$a, (i32 16)))), (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), - (Ty VGPR_32:$b))); - -let SubtargetPredicate = isNotGFX9Plus in -def : GCNPat; - -let SubtargetPredicate = isGFX9GFX10 in -def : GCNPat; -} //True16Predicate = NotHasTrue16BitInsts + (Ty VGPR_32:$b))), + (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) +>; let True16Predicate = UseFakeTrue16Insts in def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b0d6fd95cd271..5097ac03954d5 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = MI.getIterator(); ++MBBI; - const SITargetLowering *TLI = - static_cast<const SITargetLowering *>(STM->getTargetLowering()); + const SITargetLowering *TLI = STM->getTargetLowering(); for ( ; MBBI != E; ++MBBI) { MachineInstr &MINext = *MBBI; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 67ad28661da43..75ce67c00228d 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -42,7 +42,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { - const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI); + const GCNSubtarget &ST = *STI; FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); MaxNumWorkGroups = ST.getMaxNumWorkGroups(F); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9173041a7bccd..fa2b8db6ba55a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4052,11 +4052,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, return 0; } -unsigned -SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const { +unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC, + bool IncludeCalls) const { for (MCPhysReg Reg : reverse(RC.getRegisters())) - if (MRI.isPhysRegUsed(Reg)) + if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls)) return getHWRegIndex(Reg) + 1; return 0; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 06a7a17b0246b..0008e5f8cf3b4 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -486,9 +486,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { unsigned SubReg) const; // \returns a number of registers of a given \p RC used in a function. - // Does not go inside function calls. + // Does not go inside function calls. 
If \p IncludeCalls is true, it will + // include registers that may be clobbered by calls. unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const; + const TargetRegisterClass &RC, + bool IncludeCalls = true) const; std::optional getVRegFlagValue(StringRef Name) const override { return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index d24c301fc1e51..c194e5c255d4d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1294,6 +1294,7 @@ def VISrc_256_f32 : SrcRegOrImm9 ; def VISrc_256_f64 : SrcRegOrImm9 ; def VISrc_512_b32 : SrcRegOrImm9 ; def VISrc_512_f32 : SrcRegOrImm9 ; +def VISrc_512_f64 : SrcRegOrImm9 ; def VISrc_1024_b32 : SrcRegOrImm9 ; def VISrc_1024_f32 : SrcRegOrImm9 ; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 1679cee320067..ef8faffa5f557 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -66,6 +66,13 @@ def Write4PassDGEMM : SchedWrite; def Write8PassDGEMM : SchedWrite; def Write16PassDGEMM : SchedWrite; +// WMMA/SWMMA instructions +def WriteXDL2PassWMMA : SchedWrite; +def WriteXDL4PassWMMA : SchedWrite; +def Write4PassWMMA : SchedWrite; +def Write8PassWMMA : SchedWrite; +def Write16PassWMMA : SchedWrite; + // Scalar float instructions def WriteSFPU : SchedWrite; @@ -459,6 +466,15 @@ def : InstRW<[WriteCopy], (instrs COPY)>; multiclass GFX125xCommonWriteRes { +let ReleaseAtCycles = [8] in +def : HWWriteRes; +let ReleaseAtCycles = [16] in +def : HWWriteRes; + +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; @@ -476,6 +492,11 @@ def : HWWriteRes; def : HWWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; + +def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>; +def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>; +def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>; +def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>; } // End GFX125xCommonWriteRes let SchedModel = GFX1250SpeedModel in { diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index fd39b8a1350c6..7a519117f2482 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -463,6 +463,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { case AMDGPU::V_FMA_F16_gfx9_fake16_e64: NewOpcode = AMDGPU::V_FMAAK_F16_fake16; break; + case AMDGPU::V_FMA_F64_e64: + if (ST->hasFmaakFmamkF64Insts()) + NewOpcode = AMDGPU::V_FMAAK_F64; + break; } } @@ -497,6 +501,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { case AMDGPU::V_FMA_F16_gfx9_fake16_e64: NewOpcode = AMDGPU::V_FMAMK_F16_fake16; break; + case AMDGPU::V_FMA_F64_e64: + if (ST->hasFmaakFmamkF64Insts()) + NewOpcode = AMDGPU::V_FMAMK_F64; + break; } } @@ -961,7 +969,9 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 || - MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) { + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 || + (MI.getOpcode() == AMDGPU::V_FMA_F64_e64 && + ST->hasFmaakFmamkF64Insts())) { 
shrinkMadFma(MI); continue; } @@ -1058,7 +1068,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { // fold an immediate into the shrunk instruction as a literal operand. In // GFX10 VOP3 instructions can take a literal operand anyway, so there is // no advantage to doing this. - if (ST->hasVOP3Literal() && !IsPostRA) + // However, if 64-bit literals are allowed, we still need to shrink the + // instruction so that such a literal can be folded. + if (ST->hasVOP3Literal() && + (!ST->has64BitLiterals() || AMDGPU::isTrue16Inst(MI.getOpcode())) && + !IsPostRA) continue; if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 2472b76fcf02c..e103ccc2f00e6 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -154,6 +154,10 @@ class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < let has_sdst = 0; } +class SOP1_1_REGIMM64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins SSrc_b64:$src0), "$src0", pattern> { + let has_sdst = 0; +} class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < (ops node:$src0), @@ -317,6 +321,9 @@ let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { let isBranch = 1, isIndirectBranch = 1 in { def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; + +let SubtargetPredicate = HasAddPC64Inst in +def S_ADD_PC_I64 : SOP1_1_REGIMM64 <"s_add_pc_i64">; } // End isBranch = 1, isIndirectBranch = 1 let isReturn = 1 in { @@ -2130,6 +2137,9 @@ defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; +// GFX1250 +defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>; + //===----------------------------------------------------------------------===// // SOP1 - GFX1150, GFX12 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a32078cc403e7..77258810dd68c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { #define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL #define GET_MAIInstInfoTable_IMPL +#define GET_WMMAInstInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info && Info->is_gfx940_xdl; } +bool getWMMAIsXDL(unsigned Opc) { + const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc); + return Info ? 
Info->is_wmma_xdl : false; +} + uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { switch (EncodingVal) { case MFMAScaleFormats::FP6_E2M3: @@ -639,6 +645,7 @@ bool isMAC(unsigned Opc) { Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 || Opc == AMDGPU::V_MAC_F16_e64_vi || Opc == AMDGPU::V_FMAC_F64_e64_gfx90a || + Opc == AMDGPU::V_FMAC_F64_e64_gfx12 || Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || Opc == AMDGPU::V_FMAC_F32_e64_gfx11 || Opc == AMDGPU::V_FMAC_F32_e64_gfx12 || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 6708e0a3f4549..c9d2c286bf237 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -119,6 +119,11 @@ struct True16D16Info { unsigned LoOp; }; +struct WMMAInstInfo { + uint16_t Opcode; + bool is_wmma_xdl; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL @@ -129,6 +134,7 @@ struct True16D16Info { #define GET_isMFMA_F8F6F4Table_DECL #define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL #define GET_True16D16Table_DECL +#define GET_WMMAInstInfoTable_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +LLVM_READONLY +bool getWMMAIsXDL(unsigned Opc); + // Get an equivalent BitOp3 for a binary logical \p Opc. // \returns BitOp3 modifier for the logical operation or zero. // Used in VOPD3 conversion. diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 57857c688283d..1bbbb610305e9 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in +defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; @@ -527,8 +530,19 @@ defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>; defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; +let SubtargetPredicate = HasTanhInsts in { +defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; +} + let SubtargetPredicate = HasBF16TransInsts in { defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -1131,12 +1145,21 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL ; +defm V_TANH_F32 : 
VOP1_Real_FULL; +defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>; defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; +defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; +defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; +defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; +defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; +defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; +defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; +defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 25c6cbc3e1ab5..030a6e1e978c1 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -175,10 +175,14 @@ multiclass VOP2Inst_e64, Commutable_REV; - let SubtargetPredicate = isGFX11Plus in { - if P.HasExtVOP3DPP then - def _e64_dpp : VOP3_DPP_Pseudo ; - } // End SubtargetPredicate = isGFX11Plus + if P.HasExtVOP3DPP then + def _e64_dpp : VOP3_DPP_Pseudo { + let SubtargetPredicate = isGFX11Plus; + } + else if P.HasExt64BitDPP then + def _e64_dpp : VOP3_DPP_Pseudo { + let OtherPredicates = [HasDPALU_DPP]; + } } multiclass VOP2Inst_e64_VOPD op, VOP2_DPP_Pseudo ps, VOP2_DPP { let AssemblerPredicate = HasDPP16; let SubtargetPredicate = ps.SubtargetPredicate; - let OtherPredicates = ps.OtherPredicates; + let OtherPredicates = !listconcat(ps.OtherPredicates, + !if(p.HasExt64BitDPP, [HasDPALU_DPP], []), + !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], [])); } class VOP2_DPP16 op, VOP2_DPP_Pseudo ps, int subtarget, @@ -1832,6 +1838,9 @@ let SubtargetPredicate = isGFX12Plus in { V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx12, "v_subrev_co_ci_u32">; } // End SubtargetPredicate = isGFX12Plus +let SubtargetPredicate = HasFmacF64Inst in +defm V_FMAC_F64 : VOP2_Real_FULL; + defm V_FMAMK_F64 : VOP2Only_Real_MADK64; defm V_FMAAK_F64 : VOP2Only_Real_MADK64; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 75c531913ded1..2e7f25b67fb63 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -224,12 +224,6 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32", fshr, null_frag>; defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile, int_amdgcn_alignbyte>; - -// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32. -// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored. 
-defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile>; -defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile>; - let True16Predicate = UseRealTrue16Insts in defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16>; let True16Predicate = UseFakeTrue16Insts in @@ -1960,9 +1954,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; -defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; - defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { @@ -2113,8 +2104,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>; defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>; defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>; defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>; -defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>; -defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>; +defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>; +defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>; defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>; defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>; defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>; @@ -2257,17 +2248,6 @@ multiclass VOP3_Real_BITOP3_gfx9 op, string AsmName, bit isSingle = 0> } } -// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi. -// The following is created to support that. -multiclass VOP3OpSel_Real_gfx9_with_name op, string opName, string AsmName> { - defvar psName = opName#"_e64"; - def _gfx9 : VOP3_Real(psName), SIEncodingFamily.VI>, // note: encoding family is VI - VOP3OpSel_gfx9 (psName).Pfl> { - VOP3_Pseudo ps = !cast(psName); - let AsmString = AsmName # ps.AsmOperands; - } -} - } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; @@ -2287,10 +2267,8 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>; defm V_FMA_F32 : VOP3_Real_vi <0x1cb>; defm V_FMA_F64 : VOP3_Real_vi <0x1cc>; defm V_LERP_U8 : VOP3_Real_vi <0x1cd>; -let SubtargetPredicate = isGFX8Only in { defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>; defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>; -} defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>; defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>; defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>; @@ -2335,9 +2313,6 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16" defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; -defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; -defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; - defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 31997f803dfc6..e51e9574f8de0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1223,6 +1223,8 @@ class WMMAOpcodeMapping { 
Instruction Opcode2Addr = TwoAddr; Instruction Opcode3Addr = ThreeAddr; Predicate WaveSizePredicate; + Predicate SubtargetPredicate; + field bit is_wmma_xdl; } def WMMAOpcode : GenericEnum { @@ -1315,28 +1317,39 @@ let WaveSizePredicate = isWave64 in { } class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, - bit _IsIU, bit _IsFP8BF8> + bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, + bit _HasMatrixReuse = 0, bit _IsF4 = 0> : VOP3P_Profile> { bit IsIU = _IsIU; - bit IsFP8BF8 = _IsFP8BF8; - bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); + bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B + bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32)); int IndexType = _IndexType; + let HasMatrixReuse = _HasMatrixReuse; + bit HasIModOp = _Has_ImodOp; + let HasClamp = !and(IsIU, !not(HasIModOp)); let IsPacked = 1; let IsWMMA = !not(_IsSWMMAC); let IsSWMMAC = _IsSWMMAC; - bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); - bit IsAB_BF16 = !and(IsF16BF16, isIntType.ret); + bit IsAB_F64 = !or(!eq(ArgTy[1], v2f64), !eq(ArgTy[1], v4f64)); + bit IsAB_F32 = !eq(ArgTy[1], v2f32); + bit IsAB_F16 = !or(!eq(ArgTy[1], v16f16), !eq(ArgTy[1], v8f16), !eq(ArgTy[1], v4f16)); + bit IsAB_BF16 = !or(!eq(ArgTy[1], v16i16), !eq(ArgTy[1], v8i16), !eq(ArgTy[1], v4i16), + !eq(ArgTy[1], v16bf16), !eq(ArgTy[1], v8bf16), !eq(ArgTy[1], v4bf16)); + bit IsF16BF16 = !or(IsAB_F16, IsAB_BF16); + + bit IsC_F64 = !eq(ArgTy[3], v8f64); bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); - bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); + bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16), + !eq(ArgTy[3], v8bf16), !eq(ArgTy[3], v4bf16)); bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); - bit NegLo01 = !or(IsF16BF16, IsIU); - bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); - bit NegHi01 = IsF16BF16; - bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegLo01 = !not(NoABMods); + bit NegLo2 = !and(!not(IsIU), !not(IsXF32), IsWMMA); + bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1] + bit NegHi2 = !and(!not(IsIU), !not(IsXF32), IsWMMA); bit NegLoAny = !or(NegLo01, NegLo2); bit NegHiAny = !or(NegHi01, NegHi2); @@ -1345,19 +1358,29 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, let Src1RC64 = !cast("VRegSrc_"#ArgTy[2].Size); let Src2RC64 = !if(IsSWMMAC, DstRC, !cast("VISrc_"#ArgTy[3].Size# - !cond(IsC_F32: "_f32", - IsC_F16: "_f16", + !cond(IsC_F64: "_f64", + IsC_F32: "_f32", + IsC_F16: "_f16", IsC_BF16: "_bf16", 1: "_b32"))); // For f16 and bf16 matrices A and B, each element can be modified by - // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is + // fneg(neg_lo,neg_hi = 1). For f32 and f64, neg_lo[0:1] is allowed, but + // neg_hi[0:1] is ignored. For iu4 and iu8 matrices A and B neg_lo is // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) - // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each - // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). + // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16, f32 and f64 matrix C + // each element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). 
// Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index // --------------------------------------------------------------------------- + // wmma f64_f64 | neg_lo for neg A/B | neg_lo = 1 neg C(f64) + // | neg_hi ignored | neg_hi = 1 abs C(f64) + // --------------------------------------------------------------------------- + // wmma f32_f32 | neg_lo for neg A/B | neg_lo = 1 neg C(f32) + // | neg_hi ignored | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f32_xf32 | not allowed for xf32 | not allowed + // --------------------------------------------------------------------------- // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) // --------------------------------------------------------------------------- @@ -1368,7 +1391,10 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, // | neg_lo = 1 i4/i8(sext) | i32 matrices // --------------------------------------------------------------------------- // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) - // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) + // | fp8 and bf8 matrices | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f16_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f16) + // | fp8 and bf8 matrices | neg_hi = 1 abs C(f16) // --------------------------------------------------------------------------- // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst @@ -1380,103 +1406,153 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst // --------------------------------------------------------------------------- // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix - // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst + // swmmac f16_fp8/bf8 | f8 and bf8 matrices | A Index - matrix C is in dst + // --------------------------------------------------------------------------- // pseudo - // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // fp8bf8 and xf32 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. 
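As a concrete reading of the modifier table above, here is a sketch of folding fneg/fabs of the accumulator into the src2 modifier bits (the surrounding variables are hypothetical; SISrcMods::NEG and SISrcMods::NEG_HI are the existing VOP3P modifier bits):

// For a wmma with an FP accumulator C, per the table: neg_lo[2] -> -C,
// neg_hi[2] -> |C|, and both set -> -|C|.
unsigned Src2Mods = 0;
if (SrcCIsFNeg)
  Src2Mods |= SISrcMods::NEG;
if (SrcCIsFAbs)
  Src2Mods |= SISrcMods::NEG_HI;
MIB.addImm(Src2Mods).addReg(SrcCReg);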
- - dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); - dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); - dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers)); + dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers)); + dag Src2Mods = !if(!or(IsIU, IsXF32, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers)); dag IndexKey = !cond(!eq(IndexType, 0) : (ins), !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), - !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); - dag Clamp = !if(IsIU, (ins Clamp0:$clamp), (ins)); + !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit), + !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit)); + + dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); + dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), - IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), - Clamp, Neg); + IsSWMMAC : !con((ins DstRC:$srcTiedDef), + !if(!eq(IndexType, 32), + (ins VRegSrc_64:$src2), + (ins VRegSrc_32:$src2)), + IndexKey)), + MatrixReuse, Clamp, Neg); // asm string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", !eq(IndexType, 8) : "$index_key_8bit", - !eq(IndexType, 16) : "$index_key_16bit"); - string ClampAsm = !if(IsIU, "$clamp", ""); + !eq(IndexType, 16) : "$index_key_16bit", + !eq(IndexType, 32) : "$index_key_32bit"); + string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", ""); + string ClampAsm = !if(HasClamp, "$clamp", ""); string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", !and(!not(NegLoAny), !not(NegHiAny)) : ""); - let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm; // isel patterns + bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp)); + bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp)); + bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp); + bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp); + dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0), + IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), + IsAB_BF16_IMod0 : (ins Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + NoABMods : (ins Src0VT:$src0)); + dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0), + IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), + NoABMods : (ins Src0VT:$src0)); + dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1), + IsAB_F16_IMod0 : (ins (Src1VT 
(WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), + IsAB_BF16_IMod0 : (ins Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + NoABMods : (ins Src1VT:$src1)); + dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1), + IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + NoABMods : (ins Src1VT:$src1)); + bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32)); + bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp)); + bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp)); + bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp)); + bit IsIUXF32 = !or(IsIU, IsXF32); + dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2), + IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_BF16_IMod0 : (ins Src2VT:$src2), + IsIUXF32 : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2), + IsIUXF32 : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins)); - dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), - IsAB_BF16 : (ins Src0VT:$src0), - IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), - IsFP8BF8 : (ins Src0VT:$src0)); - dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0), - IsAB_BF16 : (ins (i32 8), Src0VT:$src0), - IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), - IsFP8BF8 : (ins Src0VT:$src0)); - dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), - IsAB_BF16 : (ins Src1VT:$src1), - IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), - IsFP8BF8 : (ins Src1VT:$src1)); - dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), - IsAB_BF16 : (ins (i32 8), Src1VT:$src1), - IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), - IsFP8BF8 : (ins Src1VT:$src1)); - dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), - IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), - IsC_BF16 : (ins Src2VT:$src2), - IsIU : (ins Src2VT:$src2), - IsSWMMAC : (ins)); - dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), - IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), - IsC_BF16 : (ins (i32 8), Src2VT:$src2), - IsIU : (ins Src2VT:$src2), - IsSWMMAC : (ins)); - dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), - !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); + !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))), + !eq(IndexType, 32): (ins (i64 (SWMMACIndex32 i64:$src2, i32:$index_key_32bit)))); dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), - !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); - dag Src2InlineInPat = 
(ins (Src2VT (WMMAVISrc Src2VT:$src2))); - dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit), + !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit)); + dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); + dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2)); + dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins)); + dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); - dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat); - dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); - dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); + dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. - dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat); } -multiclass WMMAInstGFX12 { +def WMMAInstInfoTable : GenericTable { + let FilterClass = "WMMAInstInfo"; + let CppTypeName = "WMMAInstInfo"; + let Fields = ["Opcode", "is_wmma_xdl"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getWMMAInstInfoHelper"; +} + +class WMMAInstInfo { + Instruction Opcode = !cast(NAME); + bit is_wmma_xdl = 0; +} + +multiclass WMMAInstGFX12 { + + defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); + defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in - def _twoaddr : VOP3P_Pseudo{ + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in + def _twoaddr : VOP3P_Pseudo, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; } - let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in - def _threeaddr : VOP3P_Pseudo{ + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in + def _threeaddr : VOP3P_Pseudo, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; } @@ -1486,7 +1562,7 @@ multiclass WMMAInstGFX12 { - def _twoaddr : VOP3P_Pseudo{ + def _twoaddr : VOP3P_Pseudo, WMMAInstInfo { let Mnemonic = Instr; let PseudoInstr = Instr#PseudoInstrSuffix; let mayRaiseFPException = 0; @@ -1556,6 +1632,76 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, // *** IU4X32_SWMMAC_w64 lanes 0-31 will 
have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes +def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>; +def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>; +def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>; +def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>; + +let WaveSizePredicate = isWave32 in { +let SubtargetPredicate = isGFX125xOnly in { +defm V_WMMA_F32_16X16X4_F32_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x4_f32", F32_F32_WMMA_w32, "_w32">; + +let is_wmma_xdl = 1 in { +defm V_WMMA_F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_bf16", F32_BF16X32_WMMA_w32, "_w32">; +defm V_WMMA_BF16_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x32_bf16", BF16_BF16X32_WMMA_w32, "_w32">; +defm V_WMMA_BF16F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16f32_16x16x32_bf16", BF16F32_BF16_WMMA_w32, "_w32", 1>; +defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_bf8", 
F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X64_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x64_iu8", I32_IU8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_f16", F32_F16X32_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x32_f16", F16_F16X32_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_f32_32x16x128_f4", F32_32X16X128_F4_WMMA_w32, "_w32">; + +defm V_SWMMAC_F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x64_bf16", BF16_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x128_iu8", I32_IU8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">; + +} // End is_wmma_xdl = 1. 
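A minimal consumer-side sketch of the is_wmma_xdl bit set throughout the block above (assumes a const SIInstrInfo *TII and a MachineInstr MI; both queries are defined earlier in this patch):

// True only for WMMA/SWMMAC opcodes tagged is_wmma_xdl = 1 when targeting
// gfx1250; other WMMA-capable subtargets treat every WMMA/SWMMAC as XDL.
if (TII->isXDLWMMA(MI)) {
  // e.g. select the XDL SchedWrite class or hazard handling path.
}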
+ +} // End SubtargetPredicate = isGFX125xOnly +} // End WaveSizePredicate = isWave32 + let WaveSizePredicate = isWave32 in { defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">; defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">; @@ -1628,7 +1774,7 @@ class SWMMACPat_w64; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; @@ -1655,7 +1801,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { def : SWMMACPat; } -let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; @@ -1681,6 +1827,49 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { def : SWMMACPat; } +let WaveSizePredicate = isWave32 in { +let SubtargetPredicate = isGFX125xOnly in { + defm : WMMAPat<"V_WMMA_F32_16X16X4_F32_w32", int_amdgcn_wmma_f32_16x16x4_f32, F32_F32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X32_BF16_w32", int_amdgcn_wmma_f32_16x16x32_bf16, F32_BF16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_BF16_16X16X32_BF16_w32", int_amdgcn_wmma_bf16_16x16x32_bf16, BF16_BF16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_BF16F32_16X16X32_BF16_w32", int_amdgcn_wmma_bf16f32_16x16x32_bf16, BF16F32_BF16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_fp8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_bf8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_fp8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_bf8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_fp8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_bf8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_fp8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_bf8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X64_IU8_w32", int_amdgcn_wmma_i32_16x16x64_iu8, I32_IU8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X32_F16_w32", int_amdgcn_wmma_f32_16x16x32_f16, F32_F16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X32_F16_w32", int_amdgcn_wmma_f16_16x16x32_f16, F16_F16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_FP8_w32", 
int_amdgcn_wmma_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} // End SubtargetPredicate = isGFX125xOnly +} // End WaveSizePredicate = isWave32 //===----------------------------------------------------------------------===// // Begin Real Encodings @@ -1726,13 +1915,14 @@ class VOP3PeWmma op, VOPProfile P, VOP3PWMMA_Profile WMMAP> // opsel let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0, !eq(WMMAP.IndexType, 8) : index_key_8bit{0}, - !eq(WMMAP.IndexType, 16) : index_key_16bit{0}); + !eq(WMMAP.IndexType, 16) : index_key_16bit{0}, + !eq(WMMAP.IndexType, 32) : index_key_32bit{0}); let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0); - let Inst{13} = 0; + let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0); // opsel_hi let Inst{59} = 1; let Inst{60} = 1; - let Inst{14} = 1; + let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1); // neg_lo let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0); let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0); @@ -1742,7 +1932,7 @@ class VOP3PeWmma op, VOPProfile P, VOP3PWMMA_Profile WMMAP> let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0); let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0); // clamp - let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0); + let Inst{15} = !if(WMMAP.HasClamp, clamp{0}, 0); } multiclass VOP3P_WMMA_Real_Base op, VOP3PWMMA_Profile WMMAP, @@ -1765,6 +1955,12 @@ multiclass VOP3P_Real_WMMA_gfx12w64 op, VOP3PWMMA_Profile WMMAP> { } } +multiclass VOP3P_Real_WMMA_gfx1250 op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { + defm _twoaddr : VOP3P_WMMA_Real_Base ; + } +} + defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; @@ -1814,6 +2010,46 @@ defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; +defm V_WMMA_F32_16X16X4_F32_w32 : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>; +defm V_WMMA_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>; +defm V_WMMA_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x060, F32_F16X32_WMMA_w32>; +defm V_WMMA_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x061, F16_F16X32_WMMA_w32>; +defm V_WMMA_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x063, BF16_BF16X32_WMMA_w32>; +defm V_WMMA_BF16F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x064, BF16F32_BF16_WMMA_w32>; +defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06a, 
F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06b, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06c, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06d, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06e, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06f, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x070, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x071, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_I32_16X16X64_IU8_w32 : VOP3P_Real_WMMA_gfx1250 <0x072, I32_IU8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x080, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x081, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x082, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x083, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x084, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x085, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>; + +defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x068, BF16_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x069, F32_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x073, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x074, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x075, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x076, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x077, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x078, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x079, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07a, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X128_IU8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07b, I32_IU8X128_SWMMAC_w32>; + multiclass VOP3P_Real_with_name op, string backing_ps_name = NAME, string asmName = !cast(NAME).Mnemonic> { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index df215d23f7f40..2b91ea7386be4 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -432,7 +432,7 @@ class VOP3be : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } -class VOP3Pe : 
Enc64 { +class VOP3Pe_Base { bits<8> vdst; bits<4> src0_modifiers; bits<9> src0; @@ -443,7 +443,12 @@ class VOP3Pe : Enc64 { bits<1> clamp; bits<2> index_key_8bit; bits<1> index_key_16bit; + bits<1> index_key_32bit; + bits<1> matrix_a_reuse; + bits<1> matrix_b_reuse; +} +class VOP3Pe : Enc64, VOP3Pe_Base { let Inst{7-0} = !if(P.HasDst, vdst, 0); let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 @@ -451,9 +456,13 @@ class VOP3Pe : Enc64 { let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) - let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, + !if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2) - let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2) + let Inst{14} = !cond(!and(P.HasSrc2, P.HasOpSel) : src2_modifiers{3}, + P.IsDOT : 1, + P.HasMatrixReuse : matrix_b_reuse, + 1: ?); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 850b00406f09e..1c42f44765abf 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -2041,12 +2041,6 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { } break; } - case ARM::TRAPNaCl: { - uint32_t Val = 0xe7fedef0UL; - OutStreamer->AddComment("trap"); - ATS.emitInst(Val); - return; - } case ARM::tTRAP: { // Non-Darwin binutils don't yet support the "trap" mnemonic. // FIXME: Remove this special case when they do. diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 52302241fe365..57141ab69223f 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -2542,9 +2542,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case ARM::Int_eh_sjlj_dispatchsetup: { MachineFunction &MF = *MI.getParent()->getParent(); - const ARMBaseInstrInfo *AII = - static_cast(TII); - const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); + const ARMBaseRegisterInfo &RI = TII->getRegisterInfo(); // For functions using a base pointer, we rematerialize it (via the frame // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it // for us. Otherwise, expand to nothing. diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index edf68d3c9a715..7ba2487d2390d 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -2635,12 +2635,8 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { return SelectCall(&I, "memset"); } case Intrinsic::trap: { - unsigned Opcode; - if (Subtarget->isThumb()) - Opcode = ARM::tTRAP; - else - Opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opcode)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, + TII.get(Subtarget->isThumb() ? 
ARM::tTRAP : ARM::TRAP)); return true; } } diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index bb437698296ce..9b1fa5d7b99d8 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -451,12 +451,6 @@ def FeatureVirtualization : SubtargetFeature<"virtualization", "Supports Virtualization extension", [FeatureHWDivThumb, FeatureHWDivARM]>; -// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. -// See ARMInstrInfo.td for details. -// True if NaCl TRAP instruction is generated instead of the regular TRAP. -def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", - "NaCl trap">; - // True if the subtarget disallows unaligned memory // accesses for some types. For details, see // ARMTargetLowering::allowsMisalignedMemoryAccesses(). diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 50d8eee8644c1..a8da70eadea5b 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -1747,9 +1747,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, RetOpcode == ARM::TCRETURNrinotr12); isInterrupt = RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; - isTrap = - RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl || - RetOpcode == ARM::tTRAP; + isTrap = RetOpcode == ARM::TRAP || RetOpcode == ARM::tTRAP; isCmseEntry = (RetOpcode == ARM::tBXNS || RetOpcode == ARM::tBXNS_RET); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fb72bab03e750..fd3b0525c1056 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3545,8 +3545,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, auto AFI = DAG.getMachineFunction().getInfo(); auto T = const_cast(CP->getType()); auto C = const_cast(CP->getConstVal()); - auto M = const_cast(DAG.getMachineFunction(). - getFunction().getParent()); + auto M = DAG.getMachineFunction().getFunction().getParent(); auto GV = new GlobalVariable( *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + @@ -11040,13 +11039,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, DispatchBB->setIsEHPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); - unsigned trap_opcode; - if (Subtarget->isThumb()) - trap_opcode = ARM::tTRAP; - else - trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; - BuildMI(TrapBB, dl, TII->get(trap_opcode)); + BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? 
ARM::tTRAP : ARM::TRAP)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); @@ -21590,7 +21584,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21598,6 +21592,11 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 5f4aef55b22c9..9159f3d2c3ed0 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -681,7 +681,7 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 46c776c0fafc4..934ec52c6f1e4 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -2382,29 +2382,13 @@ def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary, /* * A5.4 Permanently UNDEFINED instructions. * - * For most targets use UDF #65006, for which the OS will generate SIGTRAP. - * Other UDF encodings generate SIGILL. + * Targets use UDF #65006, for which the OS will generate SIGTRAP. * - * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb. - * Encoding A1: - * 1110 0111 1111 iiii iiii iiii 1111 iiii - * Encoding T1: - * 1101 1110 iiii iiii - * It uses the following encoding: - * 1110 0111 1111 1110 1101 1110 1111 0000 - * - In ARM: UDF #60896; - * - In Thumb: UDF #254 followed by a branch-to-self. 
*/ let isTrap = 1 in -def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary, - "trap", [(trap)]>, - Requires<[IsARM,UseNaClTrap]> { - let Inst = 0xe7fedef0; -} -let isTrap = 1 in def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, "trap", [(trap)]>, - Requires<[IsARM,DontUseNaClTrap]> { + Requires<[IsARM]> { let Inst = 0xe7ffdefe; } diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td index ddc5ad8754eee..c638e96a355d1 100644 --- a/llvm/lib/Target/ARM/ARMPredicates.td +++ b/llvm/lib/Target/ARM/ARMPredicates.td @@ -167,16 +167,12 @@ def IsARM : Predicate<"!Subtarget->isThumb()">, AssemblerPredicate<(all_of (not ModeThumb)), "arm-mode">; def IsMachO : Predicate<"Subtarget->isTargetMachO()">; def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">; -def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">; def IsReadTPTPIDRURW : Predicate<"Subtarget->isReadTPTPIDRURW()">; def IsReadTPTPIDRURO : Predicate<"Subtarget->isReadTPTPIDRURO()">; def IsReadTPTPIDRPRW : Predicate<"Subtarget->isReadTPTPIDRPRW()">; def IsReadTPSoft : Predicate<"Subtarget->isReadTPSoft()">; -def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">, - AssemblerPredicate<(all_of FeatureNaClTrap), "NaCl">; -def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">; def UseNegativeImmediates : Predicate<"false">, diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 13185a7d797a3..9f600e0c685ab 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -189,7 +189,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (TM.isAAPCS_ABI()) stackAlignment = Align(8); - if (isTargetNaCl() || TM.isAAPCS16_ABI()) + if (TM.isAAPCS16_ABI()) stackAlignment = Align(16); // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: @@ -407,10 +407,9 @@ bool ARMSubtarget::useFastISel() const { if (!hasV6Ops()) return false; - // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl. - return TM.Options.EnableFastISel && - ((isTargetMachO() && !isThumb1Only()) || - (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb())); + // Thumb2 support on iOS; ARM support on iOS and Linux. 
+ return TM.Options.EnableFastISel && ((isTargetMachO() && !isThumb1Only()) || + (isTargetLinux() && !isThumb())); } unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index beb1ff6447148..637eb4560e0f1 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -338,7 +338,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); } bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } - bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index c66232ef4dc7a..e8d0d35080775 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -166,9 +166,8 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // Integer registers are 32 bits. Ret += "-n32"; - // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit - // aligned everywhere else. - if (TT.isOSNaCl() || ABI == ARM::ARM_ABI_AAPCS16) + // The stack is 64 bit aligned on AAPCS and 32 bit aligned everywhere else. + if (ABI == ARM::ARM_ABI_AAPCS16) Ret += "-S128"; else if (ABI == ARM::ARM_ABI_AAPCS) Ret += "-S64"; diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp index cf84f1043cc69..3692eeeaaa64f 100644 --- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -16,7 +16,6 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/SectionKind.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index ca06b9e3cb661..522c235a90a8f 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -91,9 +91,9 @@ class ARMTTIImpl final : public BasicTTIImplBase { ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass, - ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, - ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, - ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates + ARM::FeatureAClass, ARM::FeatureStrictAlign, ARM::FeatureLongCalls, + ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt, + ARM::FeatureNoNegativeImmediates }; const ARMSubtarget *getST() const { return ST; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 376bddb120d5f..146fc6704c6da 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -215,7 +215,7 @@ static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) { const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, uint64_t Value) const { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { 
case ARM::fixup_arm_thumb_br: { // Relaxing tB to t2B. tB has a signed 12-bit displacement with the // low bit being an implied zero. There's an implied +4 offset for the @@ -311,12 +311,13 @@ static bool needsInterworking(const MCAssembler &Asm, const MCSymbol *Sym, return false; } -bool ARMAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, +bool ARMAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &, + const MCFixup &Fixup, const MCValue &Target, uint64_t Value, bool Resolved) const { const MCSymbol *Sym = Target.getAddSym(); - if (needsInterworking(*Asm, Sym, Fixup.getTargetKind())) + if (needsInterworking(*Asm, Sym, Fixup.getKind())) return true; if (!Resolved) @@ -947,7 +948,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCFixup &Fixup, } // Create relocations for unconditional branches to function symbols with // different execution mode in ELF binaries. - if (needsInterworking(*Asm, Sym, Fixup.getTargetKind())) + if (needsInterworking(*Asm, Sym, Fixup.getKind())) return true; // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination @@ -1093,7 +1094,7 @@ std::optional ARMAsmBackend::evaluateFixup(const MCFragment &F, // For a few PC-relative fixups in Thumb mode, offsets need to be aligned // down. We compensate here because the default handler's `Value` decrement // doesn't account for this alignment. - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case ARM::fixup_t2_ldst_pcrel_12: case ARM::fixup_t2_pcrel_10: case ARM::fixup_t2_pcrel_9: diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 877e3afdb1d57..07d2cf784c442 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -51,7 +51,8 @@ class ARMAsmBackend : public MCAsmBackend { const char *reasonForFixupRelaxation(const MCFixup &Fixup, uint64_t Value) const; - bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t, + bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &, + const MCValue &, uint64_t, bool) const override; void relaxInstruction(MCInst &Inst, diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index b0ebb74424c78..50e9ca1d3759d 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -75,7 +75,7 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, bool IsPCRel) const { - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); uint8_t Specifier = Target.getSpecifier(); auto CheckFDPIC = [&](uint32_t Type) { if (getOSABI() != ELF::ELFOSABI_ARM_FDPIC) @@ -105,7 +105,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup, } if (IsPCRel) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: reportError(Fixup.getLoc(), "unsupported relocation type"); return ELF::R_ARM_NONE; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index c61e405bd3a02..eaba6fe5bfcb7 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -638,7 +638,7 @@ class ARMELFStreamer : public MCELFStreamer { Offset = 0; } 
bool hasInfo() { return F != nullptr; } - MCDataFragment *F = nullptr; + MCFragment *F = nullptr; uint64_t Offset = 0; ElfMappingSymbol State = EMS_None; }; @@ -650,11 +650,11 @@ class ARMELFStreamer : public MCELFStreamer { // This is a tentative symbol, it won't really be emitted until it's // actually needed. ElfMappingSymbolInfo *EMS = LastEMSInfo.get(); - auto *DF = dyn_cast_or_null(getCurrentFragment()); - if (!DF) + auto *DF = getCurrentFragment(); + if (DF->getKind() != MCFragment::FT_Data) return; EMS->F = DF; - EMS->Offset = DF->getContents().size(); + EMS->Offset = DF->getFixedSize(); LastEMSInfo->State = EMS_Data; return; } @@ -686,7 +686,7 @@ class ARMELFStreamer : public MCELFStreamer { Symbol->setBinding(ELF::STB_LOCAL); } - void emitMappingSymbol(StringRef Name, MCDataFragment &F, uint64_t Offset) { + void emitMappingSymbol(StringRef Name, MCFragment &F, uint64_t Offset) { auto *Symbol = cast(getContext().createLocalSymbol(Name)); emitLabelAtPos(Symbol, SMLoc(), F, Offset); Symbol->setType(ELF::STT_NOTYPE); @@ -1145,9 +1145,8 @@ void ARMTargetELFStreamer::finish() { auto *Text = static_cast(Ctx.getObjectFileInfo()->getTextSection()); for (auto &F : *Text) - if (auto *DF = dyn_cast(&F)) - if (!DF->getContents().empty()) - return; + if (F.getSize()) + return; Text->setFlags(Text->getFlags() | ELF::SHF_ARM_PURECODE); } } @@ -1208,7 +1207,7 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { } void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { - MCDataFragment *Frag = getOrCreateDataFragment(); + MCFragment *Frag = getOrCreateDataFragment(); Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind)); } @@ -1296,7 +1295,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext()); visitUsedExpr(*PersonalityRef); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); DF->addFixup( MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4)); } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 2d22b27ceb131..e84aaaad3750d 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -152,12 +152,6 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) { ARMArchFeature += "+thumb-mode,+v4t"; } - if (TT.isOSNaCl()) { - if (!ARMArchFeature.empty()) - ARMArchFeature += ","; - ARMArchFeature += "+nacl-trap"; - } - if (TT.isOSWindows()) { if (!ARMArchFeature.empty()) ARMArchFeature += ","; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index c0c40ade5810a..354de8fd7b4bb 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -192,7 +192,7 @@ void ARMMachObjectWriter::recordARMScatteredHalfRelocation( // relocation entry in the low 16 bits of r_address field. unsigned ThumbBit = 0; unsigned MovtBit = 0; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: break; case ARM::fixup_arm_movt_hi16: MovtBit = 1; @@ -465,7 +465,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, // PAIR. I.e. it's correct that we insert the high bits of the addend in the // MOVW case here. relocation entries. 
uint32_t Value = 0; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: break; case ARM::fixup_arm_movw_lo16: case ARM::fixup_t2_movw_lo16: diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 958790d49d087..dda87537809cf 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -90,7 +90,7 @@ void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Data[Fixup.getOffset() + 1] = 0x1; support::endian::write32be(&Data[Fixup.getOffset() + 4], Value); } - } else if (Fixup.getTargetKind() == BPF::FK_BPF_PCRel_4) { + } else if (Fixup.getKind() == BPF::FK_BPF_PCRel_4) { // The input Value represents the number of bytes. Value = (uint32_t)((Value - 8) / 8); support::endian::write(&Data[Fixup.getOffset() + 4], Value, diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp index ce1da6e58b9cd..694d9eab9694b 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp @@ -71,7 +71,7 @@ MCFixupKindInfo CSKYAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext &Ctx) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unknown fixup kind!"); case CSKY::fixup_csky_got32: @@ -157,7 +157,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, +bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &, + const MCFixup &Fixup, const MCValue &, uint64_t Value, bool Resolved) const { @@ -166,7 +167,7 @@ bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, return true; int64_t Offset = int64_t(Value); - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: return false; case CSKY::fixup_csky_pcrel_imm10_scale2: @@ -186,7 +187,7 @@ std::optional CSKYAsmBackend::evaluateFixup(const MCFragment &F, // For a few PC-relative fixups, offsets need to be aligned down. We // compensate here because the default handler's `Value` decrement doesn't // account for this alignment. 
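// Editorial sketch, not part of the patch: what the align-down compensation
// described above (and in the matching ARM hunk earlier) does. The helper,
// the +4 PC bias (a Thumb convention), and the addresses below are
// illustrative assumptions, not code from either backend.

#include <cassert>
#include <cstdint>

// Align Value down to a power-of-two boundary.
static uint64_t alignDown(uint64_t Value, uint64_t Align) {
  assert((Align & (Align - 1)) == 0 && "alignment must be a power of two");
  return Value & ~(Align - 1);
}

int main() {
  // A word-scaled PC-relative load at 0x1002 referencing data at 0x1010:
  // hardware bases the offset on the aligned-down PC, so the encodable
  // offset is 0x1010 - alignDown(0x1002 + 4, 4) = 0xC, not 0x1010 - 0x1006.
  return 0x1010 - alignDown(0x1002 + 4, 4) == 0xC ? 0 : 1;
}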
-  switch (Fixup.getTargetKind()) {
+  switch (Fixup.getKind()) {
   case CSKY::fixup_csky_pcrel_uimm16_scale4:
   case CSKY::fixup_csky_pcrel_uimm8_scale4:
   case CSKY::fixup_csky_pcrel_uimm7_scale4:
@@ -264,7 +265,7 @@ bool CSKYAsmBackend::shouldForceRelocation(const MCFixup &Fixup,
                                            const MCValue &Target /*STI*/) {
   if (Target.getSpecifier())
     return true;
-  switch (Fixup.getTargetKind()) {
+  switch (Fixup.getKind()) {
   default:
     break;
   case CSKY::fixup_csky_doffset_imm18:
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
index 1d3a22c2bbbb4..1c8516fbf53a7 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h
@@ -38,7 +38,8 @@ class CSKYAsmBackend : public MCAsmBackend {
   void relaxInstruction(MCInst &Inst,
                         const MCSubtargetInfo &STI) const override;
-  bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t,
+  bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &,
+                                    const MCValue &, uint64_t,
                                     bool) const override;
   bool writeNopData(raw_ostream &OS, uint64_t Count,
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
index 1de82e6cc6ce8..d042d26e6ef21 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp
@@ -39,7 +39,7 @@ unsigned CSKYELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                            bool IsPCRel) const {
   const MCExpr *Expr = Fixup.getValue();
   // Determine the type of the relocation
-  unsigned Kind = Fixup.getTargetKind();
+  auto Kind = Fixup.getKind();
   uint8_t Modifier = Target.getSpecifier();
   switch (Target.getSpecifier()) {
diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
index c97c604fdbf77..d9d9b36d0b739 100644
--- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
+++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp
@@ -202,7 +202,7 @@ DataScalarizerVisitor::createArrayFromVector(IRBuilder<> &Builder, Value *Vec,
   // original vector's defining instruction if available, else immediately after
   // the alloca
   if (auto *Instr = dyn_cast(Vec))
-    Builder.SetInsertPoint(Instr->getNextNonDebugInstruction());
+    Builder.SetInsertPoint(Instr->getNextNode());
   SmallVector GEPs(ArrNumElems);
   for (unsigned I = 0; I < ArrNumElems; ++I) {
     Value *EE = Builder.CreateExtractElement(Vec, I, Name + ".extract");
@@ -302,7 +302,7 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
 bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
   Value *PtrOperand = GEPI.getPointerOperand();
-  Type *OrigGEPType = GEPI.getPointerOperandType();
+  Type *OrigGEPType = GEPI.getSourceElementType();
   Type *NewGEPType = OrigGEPType;
   bool NeedsTransform = false;
@@ -319,6 +319,11 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
     }
   }
+  // Scalar geps should remain scalar geps.
The dxil-flatten-arrays pass will + // convert these scalar geps into flattened array geps + if (!isa(OrigGEPType)) + NewGEPType = OrigGEPType; + // Note: We bail if this isn't a gep touched via alloca or global // transformations if (!NeedsTransform) diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index 0b7cf2f970172..f0e2e786dfaf4 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/Local.h" #include #include @@ -40,18 +41,19 @@ class DXILFlattenArraysLegacy : public ModulePass { static char ID; // Pass identification. }; -struct GEPData { - ArrayType *ParentArrayType; - Value *ParentOperand; - SmallVector Indices; - SmallVector Dims; - bool AllIndicesAreConstInt; +struct GEPInfo { + ArrayType *RootFlattenedArrayType; + Value *RootPointerOperand; + SmallMapVector VariableOffsets; + APInt ConstantOffset; }; class DXILFlattenArraysVisitor : public InstVisitor { public: - DXILFlattenArraysVisitor() {} + DXILFlattenArraysVisitor( + SmallDenseMap &GlobalMap) + : GlobalMap(GlobalMap) {} bool visit(Function &F); // InstVisitor methods. They return true if the instruction was scalarized, // false if nothing changed. @@ -78,7 +80,8 @@ class DXILFlattenArraysVisitor private: SmallVector PotentiallyDeadInstrs; - DenseMap GEPChainMap; + SmallDenseMap GEPChainInfoMap; + SmallDenseMap &GlobalMap; bool finish(); ConstantInt *genConstFlattenIndices(ArrayRef Indices, ArrayRef Dims, @@ -86,27 +89,11 @@ class DXILFlattenArraysVisitor Value *genInstructionFlattenIndices(ArrayRef Indices, ArrayRef Dims, IRBuilder<> &Builder); - - // Helper function to collect indices and dimensions from a GEP instruction - void collectIndicesAndDimsFromGEP(GetElementPtrInst &GEP, - SmallVectorImpl &Indices, - SmallVectorImpl &Dims, - bool &AllIndicesAreConstInt); - - void - recursivelyCollectGEPs(GetElementPtrInst &CurrGEP, - ArrayType *FlattenedArrayType, Value *PtrOperand, - unsigned &GEPChainUseCount, - SmallVector Indices = SmallVector(), - SmallVector Dims = SmallVector(), - bool AllIndicesAreConstInt = true); - bool visitGetElementPtrInstInGEPChain(GetElementPtrInst &GEP); - bool visitGetElementPtrInstInGEPChainBase(GEPData &GEPInfo, - GetElementPtrInst &GEP); }; } // namespace bool DXILFlattenArraysVisitor::finish() { + GEPChainInfoMap.clear(); RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs); return true; } @@ -225,131 +212,159 @@ bool DXILFlattenArraysVisitor::visitAllocaInst(AllocaInst &AI) { return true; } -void DXILFlattenArraysVisitor::collectIndicesAndDimsFromGEP( - GetElementPtrInst &GEP, SmallVectorImpl &Indices, - SmallVectorImpl &Dims, bool &AllIndicesAreConstInt) { - - Type *CurrentType = GEP.getSourceElementType(); - - // Note index 0 is the ptr index. 
- for (Value *Index : llvm::drop_begin(GEP.indices(), 1)) { - Indices.push_back(Index); - AllIndicesAreConstInt &= isa(Index); +bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { + // Do not visit GEPs more than once + if (GEPChainInfoMap.contains(cast(&GEP))) + return false; - if (auto *ArrayTy = dyn_cast(CurrentType)) { - Dims.push_back(ArrayTy->getNumElements()); - CurrentType = ArrayTy->getElementType(); - } else { - assert(false && "Expected array type in GEP chain"); - } + Value *PtrOperand = GEP.getPointerOperand(); + // It shouldn't(?) be possible for the pointer operand of a GEP to be a PHI + // node unless HLSL has pointers. If this assumption is incorrect or HLSL gets + // pointer types, then the handling of this case can be implemented later. + assert(!isa(PtrOperand) && + "Pointer operand of GEP should not be a PHI Node"); + + // Replace a GEP ConstantExpr pointer operand with a GEP instruction so that + // it can be visited + if (auto *PtrOpGEPCE = dyn_cast(PtrOperand); + PtrOpGEPCE && PtrOpGEPCE->getOpcode() == Instruction::GetElementPtr) { + GetElementPtrInst *OldGEPI = + cast(PtrOpGEPCE->getAsInstruction()); + OldGEPI->insertBefore(GEP.getIterator()); + + IRBuilder<> Builder(&GEP); + SmallVector Indices(GEP.indices()); + Value *NewGEP = + Builder.CreateGEP(GEP.getSourceElementType(), OldGEPI, Indices, + GEP.getName(), GEP.getNoWrapFlags()); + assert(isa(NewGEP) && + "Expected newly-created GEP to be an instruction"); + GetElementPtrInst *NewGEPI = cast(NewGEP); + + GEP.replaceAllUsesWith(NewGEPI); + GEP.eraseFromParent(); + visitGetElementPtrInst(*OldGEPI); + visitGetElementPtrInst(*NewGEPI); + return true; } -} - -void DXILFlattenArraysVisitor::recursivelyCollectGEPs( - GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType, - Value *PtrOperand, unsigned &GEPChainUseCount, SmallVector Indices, - SmallVector Dims, bool AllIndicesAreConstInt) { - // Check if this GEP is already in the map to avoid circular references - if (GEPChainMap.count(&CurrGEP) > 0) - return; - // Collect indices and dimensions from the current GEP - collectIndicesAndDimsFromGEP(CurrGEP, Indices, Dims, AllIndicesAreConstInt); - bool IsMultiDimArr = isMultiDimensionalArray(CurrGEP.getSourceElementType()); - if (!IsMultiDimArr) { - assert(GEPChainUseCount < FlattenedArrayType->getNumElements()); - GEPChainMap.insert( - {&CurrGEP, - {std::move(FlattenedArrayType), PtrOperand, std::move(Indices), - std::move(Dims), AllIndicesAreConstInt}}); - return; - } - bool GepUses = false; - for (auto *User : CurrGEP.users()) { - if (GetElementPtrInst *NestedGEP = dyn_cast(User)) { - recursivelyCollectGEPs(*NestedGEP, FlattenedArrayType, PtrOperand, - ++GEPChainUseCount, Indices, Dims, - AllIndicesAreConstInt); - GepUses = true; - } - } - // This case is just incase the gep chain doesn't end with a 1d array. 
-  if (IsMultiDimArr && GEPChainUseCount > 0 && !GepUses) {
-    GEPChainMap.insert(
-        {&CurrGEP,
-         {std::move(FlattenedArrayType), PtrOperand, std::move(Indices),
-          std::move(Dims), AllIndicesAreConstInt}});
+  // Construct GEPInfo for this GEP
+  GEPInfo Info;
+
+  // Obtain the variable and constant byte offsets computed by this GEP
+  const DataLayout &DL = GEP.getDataLayout();
+  unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP.getType());
+  Info.ConstantOffset = {BitWidth, 0};
+  [[maybe_unused]] bool Success = GEP.collectOffset(
+      DL, BitWidth, Info.VariableOffsets, Info.ConstantOffset);
+  assert(Success && "Failed to collect offsets for GEP");
+
+  // If there is a parent GEP, inherit the root array type and pointer, and
+  // merge the byte offsets. Otherwise, this GEP is itself the root of a GEP
+  // chain and we need to determine the root array type.
+  if (auto *PtrOpGEP = dyn_cast(PtrOperand)) {
+    assert(GEPChainInfoMap.contains(PtrOpGEP) &&
+           "Expected parent GEP to be visited before this GEP");
+    GEPInfo &PGEPInfo = GEPChainInfoMap[PtrOpGEP];
+    Info.RootFlattenedArrayType = PGEPInfo.RootFlattenedArrayType;
+    Info.RootPointerOperand = PGEPInfo.RootPointerOperand;
+    for (auto &VariableOffset : PGEPInfo.VariableOffsets)
+      Info.VariableOffsets.insert(VariableOffset);
+    Info.ConstantOffset += PGEPInfo.ConstantOffset;
+  } else {
+    Info.RootPointerOperand = PtrOperand;
+
+    // We should try to determine the type of the root from the pointer rather
+    // than the GEP's source element type because this could be a scalar GEP
+    // into an array-typed pointer from an Alloca or Global Variable.
+    Type *RootTy = GEP.getSourceElementType();
+    if (auto *GlobalVar = dyn_cast(PtrOperand)) {
+      if (GlobalMap.contains(GlobalVar))
+        GlobalVar = GlobalMap[GlobalVar];
+      Info.RootPointerOperand = GlobalVar;
+      RootTy = GlobalVar->getValueType();
+    } else if (auto *Alloca = dyn_cast(PtrOperand))
+      RootTy = Alloca->getAllocatedType();
+    assert(!isMultiDimensionalArray(RootTy) &&
+           "Expected root array type to be flattened");
+
+    // If the root type is not an array, we don't need to do any flattening
+    if (!isa(RootTy))
+      return false;
+
+    Info.RootFlattenedArrayType = cast(RootTy);
+  }
-bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChain(
-    GetElementPtrInst &GEP) {
-  GEPData GEPInfo = GEPChainMap.at(&GEP);
-  return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP);
-}
-bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChainBase(
-    GEPData &GEPInfo, GetElementPtrInst &GEP) {
-  IRBuilder<> Builder(&GEP);
-  Value *FlatIndex;
-  if (GEPInfo.AllIndicesAreConstInt)
-    FlatIndex = genConstFlattenIndices(GEPInfo.Indices, GEPInfo.Dims, Builder);
-  else
-    FlatIndex =
-        genInstructionFlattenIndices(GEPInfo.Indices, GEPInfo.Dims, Builder);
-
-  ArrayType *FlattenedArrayType = GEPInfo.ParentArrayType;
-
-  // Don't append '.flat' to an empty string. If the SSA name isn't available
-  // it could conflict with the ParentOperand's name.
-  std::string FlatName = GEP.hasName() ? GEP.getName().str() + ".flat" : "";
-
-  Value *FlatGEP = Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParentOperand,
-                                     {Builder.getInt32(0), FlatIndex}, FlatName,
-                                     GEP.getNoWrapFlags());
-
-  // Note: Old gep will become an invalid instruction after replaceAllUsesWith.
-  // Erase the old GEP in the map before to avoid invalid instructions
-  // and circular references.
- GEPChainMap.erase(&GEP); - - GEP.replaceAllUsesWith(FlatGEP); - GEP.eraseFromParent(); - return true; -} - -bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { - auto It = GEPChainMap.find(&GEP); - if (It != GEPChainMap.end()) - return visitGetElementPtrInstInGEPChain(GEP); - if (!isMultiDimensionalArray(GEP.getSourceElementType())) - return false; - - ArrayType *ArrType = cast(GEP.getSourceElementType()); - IRBuilder<> Builder(&GEP); - auto [TotalElements, BaseType] = getElementCountAndType(ArrType); - ArrayType *FlattenedArrayType = ArrayType::get(BaseType, TotalElements); - - Value *PtrOperand = GEP.getPointerOperand(); + // GEPs without users or GEPs with non-GEP users should be replaced such that + // the chain of GEPs they are a part of are collapsed to a single GEP into a + // flattened array. + bool ReplaceThisGEP = GEP.users().empty(); + for (Value *User : GEP.users()) + if (!isa(User)) + ReplaceThisGEP = true; + + if (ReplaceThisGEP) { + unsigned BytesPerElem = + DL.getTypeAllocSize(Info.RootFlattenedArrayType->getArrayElementType()); + assert(isPowerOf2_32(BytesPerElem) && + "Bytes per element should be a power of 2"); + + // Compute the 32-bit index for this flattened GEP from the constant and + // variable byte offsets in the GEPInfo + IRBuilder<> Builder(&GEP); + Value *ZeroIndex = Builder.getInt32(0); + uint64_t ConstantOffset = + Info.ConstantOffset.udiv(BytesPerElem).getZExtValue(); + assert(ConstantOffset < UINT32_MAX && + "Constant byte offset for flat GEP index must fit within 32 bits"); + Value *FlattenedIndex = Builder.getInt32(ConstantOffset); + for (auto [VarIndex, Multiplier] : Info.VariableOffsets) { + assert(Multiplier.getActiveBits() <= 32 && + "The multiplier for a flat GEP index must fit within 32 bits"); + assert(VarIndex->getType()->isIntegerTy(32) && + "Expected i32-typed GEP indices"); + Value *VI; + if (Multiplier.getZExtValue() % BytesPerElem != 0) { + // This can happen, e.g., with i8 GEPs. To handle this we just divide + // by BytesPerElem using an instruction after multiplying VarIndex by + // Multiplier. + VI = Builder.CreateMul(VarIndex, + Builder.getInt32(Multiplier.getZExtValue())); + VI = Builder.CreateLShr(VI, Builder.getInt32(Log2_32(BytesPerElem))); + } else + VI = Builder.CreateMul( + VarIndex, + Builder.getInt32(Multiplier.getZExtValue() / BytesPerElem)); + FlattenedIndex = Builder.CreateAdd(FlattenedIndex, VI); + } - unsigned GEPChainUseCount = 0; - recursivelyCollectGEPs(GEP, FlattenedArrayType, PtrOperand, GEPChainUseCount); - - // NOTE: hasNUses(0) is not the same as GEPChainUseCount == 0. - // Here recursion is used to get the length of the GEP chain. - // Handle zero uses here because there won't be an update via - // a child in the chain later. 
-  if (GEPChainUseCount == 0) {
-    SmallVector Indices;
-    SmallVector Dims;
-    bool AllIndicesAreConstInt = true;
-
-    // Collect indices and dimensions from the GEP
-    collectIndicesAndDimsFromGEP(GEP, Indices, Dims, AllIndicesAreConstInt);
-    GEPData GEPInfo{std::move(FlattenedArrayType), PtrOperand,
-                    std::move(Indices), std::move(Dims), AllIndicesAreConstInt};
-    return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP);
+    // Construct a new GEP for the flattened array to replace the current GEP
+    Value *NewGEP = Builder.CreateGEP(
+        Info.RootFlattenedArrayType, Info.RootPointerOperand,
+        {ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags());
+
+    // If the pointer operand is a global variable and all indices are 0,
+    // IRBuilder::CreateGEP will return the global variable instead of creating
+    // a GEP instruction or GEP ConstantExpr. In this case we have to create and
+    // insert our own GEP instruction.
+    if (!isa(NewGEP))
+      NewGEP = GetElementPtrInst::Create(
+          Info.RootFlattenedArrayType, Info.RootPointerOperand,
+          {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(),
+          Builder.GetInsertPoint());
+
+    // Replace the current GEP with the new GEP. Store GEPInfo into the map
+    // for later use in case this GEP was not the end of the chain
+    GEPChainInfoMap.insert({cast(NewGEP), std::move(Info)});
+    GEP.replaceAllUsesWith(NewGEP);
+    GEP.eraseFromParent();
+    return true;
+  }

+  // This GEP is potentially dead at the end of the pass since it may not have
+  // any users anymore after GEP chains have been collapsed. We still store its
+  // GEPInfo so that GEPs further down the chain can use it to compute their
+  // indices.
+  GEPChainInfoMap.insert({cast(&GEP), std::move(Info)});
   PotentiallyDeadInstrs.emplace_back(&GEP);
   return false;
 }
@@ -416,9 +431,8 @@ static Constant *transformInitializer(Constant *Init, Type *OrigType,
   return ConstantArray::get(FlattenedType, FlattenedElements);
 }
-static void
-flattenGlobalArrays(Module &M,
-                    DenseMap &GlobalMap) {
+static void flattenGlobalArrays(
+    Module &M, SmallDenseMap &GlobalMap) {
   LLVMContext &Ctx = M.getContext();
   for (GlobalVariable &G : M.globals()) {
     Type *OrigType = G.getValueType();
@@ -456,9 +470,9 @@ flattenGlobalArrays(Module &M,
 static bool flattenArrays(Module &M) {
   bool MadeChange = false;
-  DXILFlattenArraysVisitor Impl;
-  DenseMap GlobalMap;
+  SmallDenseMap GlobalMap;
   flattenGlobalArrays(M, GlobalMap);
+  DXILFlattenArraysVisitor Impl(GlobalMap);
   for (auto &F : make_early_inc_range(M.functions())) {
     if (F.isDeclaration())
       continue;
diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
index 76a46c7a2b760..c73648f21e8d7 100644
--- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
+++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp
@@ -98,9 +98,9 @@ static void fixI8UseChain(Instruction &I,
       ElementType = AI->getAllocatedType();
     if (auto *GEP = dyn_cast(NewOperands[0])) {
       ElementType = GEP->getSourceElementType();
-      if (ElementType->isArrayTy())
-        ElementType = ElementType->getArrayElementType();
     }
+    if (ElementType->isArrayTy())
+      ElementType = ElementType->getArrayElementType();
     LoadInst *NewLoad = Builder.CreateLoad(ElementType, NewOperands[0]);
     ReplacedValues[Load] = NewLoad;
     ToRemove.push_back(Load);
@@ -347,7 +346,6 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src,
   if (ByteLength == 0)
     return;
-  LLVMContext &Ctx = Builder.getContext();
   const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
   auto GetArrTyFromVal = [](Value *Val) ->
ArrayType * { @@ -392,10 +391,11 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src, assert(ByteLength % DstElemByteSize == 0 && "memcpy length must be divisible by array element type"); for (uint64_t I = 0; I < NumElemsToCopy; ++I) { - Value *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), I); - Value *SrcPtr = Builder.CreateInBoundsGEP(SrcElemTy, Src, Offset, "gep"); + SmallVector Indices = {Builder.getInt32(0), + Builder.getInt32(I)}; + Value *SrcPtr = Builder.CreateInBoundsGEP(SrcArrTy, Src, Indices, "gep"); Value *SrcVal = Builder.CreateLoad(SrcElemTy, SrcPtr); - Value *DstPtr = Builder.CreateInBoundsGEP(DstElemTy, Dst, Offset, "gep"); + Value *DstPtr = Builder.CreateInBoundsGEP(DstArrTy, Dst, Indices, "gep"); Builder.CreateStore(SrcVal, DstPtr); } } @@ -403,7 +403,6 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src, static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val, ConstantInt *SizeCI, DenseMap &ReplacedValues) { - LLVMContext &Ctx = Builder.getContext(); [[maybe_unused]] const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); [[maybe_unused]] uint64_t OrigSize = SizeCI->getZExtValue(); @@ -444,8 +443,9 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val, } for (uint64_t I = 0; I < Size; ++I) { - Value *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), I); - Value *Ptr = Builder.CreateGEP(ElemTy, Dst, Offset, "gep"); + Value *Zero = Builder.getInt32(0); + Value *Offset = Builder.getInt32(I); + Value *Ptr = Builder.CreateGEP(ArrTy, Dst, {Zero, Offset}, "gep"); Builder.CreateStore(TypedVal, Ptr); } } @@ -478,9 +478,9 @@ static void legalizeMemCpy(Instruction &I, ToRemove.push_back(CI); } -static void removeMemSet(Instruction &I, - SmallVectorImpl &ToRemove, - DenseMap &ReplacedValues) { +static void legalizeMemSet(Instruction &I, + SmallVectorImpl &ToRemove, + DenseMap &ReplacedValues) { CallInst *CI = dyn_cast(&I); if (!CI) @@ -562,6 +562,53 @@ legalizeGetHighLowi64Bytes(Instruction &I, } } +static void +legalizeScalarLoadStoreOnArrays(Instruction &I, + SmallVectorImpl &ToRemove, + DenseMap &) { + + Value *PtrOp; + unsigned PtrOpIndex; + [[maybe_unused]] Type *LoadStoreTy; + if (auto *LI = dyn_cast(&I)) { + PtrOp = LI->getPointerOperand(); + PtrOpIndex = LI->getPointerOperandIndex(); + LoadStoreTy = LI->getType(); + } else if (auto *SI = dyn_cast(&I)) { + PtrOp = SI->getPointerOperand(); + PtrOpIndex = SI->getPointerOperandIndex(); + LoadStoreTy = SI->getValueOperand()->getType(); + } else + return; + + // If the load/store is not of a single-value type (i.e., scalar or vector) + // then we do not modify it. It shouldn't be a vector either because the + // dxil-data-scalarization pass is expected to run before this, but it's not + // incorrect to apply this transformation to vector load/stores. 
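// Editorial illustration, not part of the patch: the rewrite this function
// performs, shown as LLVM IR with hypothetical names. A scalar load whose
// pointer operand is an array-typed global or alloca, e.g.
//   @g = internal global [4 x float] zeroinitializer
//   %v = load float, ptr @g
// gets its pointer operand replaced by an explicit zero-index GEP into the
// array (emitted below with all no-wrap flags set):
//   %0 = getelementptr inbounds [4 x float], ptr @g, i32 0, i32 0
//   %v = load float, ptr %0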
+  if (!LoadStoreTy->isSingleValueType())
+    return;
+
+  Type *ArrayTy;
+  if (auto *GlobalVarPtrOp = dyn_cast(PtrOp))
+    ArrayTy = GlobalVarPtrOp->getValueType();
+  else if (auto *AllocaPtrOp = dyn_cast(PtrOp))
+    ArrayTy = AllocaPtrOp->getAllocatedType();
+  else
+    return;
+
+  if (!isa(ArrayTy))
+    return;
+
+  assert(ArrayTy->getArrayElementType() == LoadStoreTy &&
+         "Expected array element type to be the same as the scalar load or "
+         "store type");
+
+  Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0);
+  Value *GEP = GetElementPtrInst::Create(
+      ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator());
+  I.setOperand(PtrOpIndex, GEP);
+}
+
 namespace {
 class DXILLegalizationPipeline {
@@ -603,7 +650,7 @@ class DXILLegalizationPipeline {
     LegalizationPipeline[Stage1].push_back(legalizeGetHighLowi64Bytes);
     LegalizationPipeline[Stage1].push_back(legalizeFreeze);
     LegalizationPipeline[Stage1].push_back(legalizeMemCpy);
-    LegalizationPipeline[Stage1].push_back(removeMemSet);
+    LegalizationPipeline[Stage1].push_back(legalizeMemSet);
     LegalizationPipeline[Stage1].push_back(updateFnegToFsub);
     // Note: legalizeGetHighLowi64Bytes and
     // downcastI64toI32InsertExtractElements both modify extractelement, so they
@@ -612,6 +659,7 @@ class DXILLegalizationPipeline {
     // downcastI64toI32InsertExtractElements needs to handle.
     LegalizationPipeline[Stage2].push_back(
        downcastI64toI32InsertExtractElements);
+    LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays);
   }
 };
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index 40fe6c6e639e4..84751d2db2266 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -107,10 +107,10 @@ class DirectXPassConfig : public TargetPassConfig {
     addPass(createDXILIntrinsicExpansionLegacyPass());
     addPass(createDXILCBufferAccessLegacyPass());
     addPass(createDXILDataScalarizationLegacyPass());
-    addPass(createDXILFlattenArraysLegacyPass());
     ScalarizerPassOptions DxilScalarOptions;
     DxilScalarOptions.ScalarizeLoadStore = true;
     addPass(createScalarizerPass(DxilScalarOptions));
+    addPass(createDXILFlattenArraysLegacyPass());
    addPass(createDXILForwardHandleAccessesLegacyPass());
    addPass(createDXILLegalizeLegacyPass());
    addPass(createDXILResourceImplicitBindingLegacyPass());
diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index bb7814c5226fd..35da34ed0a899 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -1005,7 +1005,7 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) {
     SmallVector ToRemove;
     for (MachineBasicBlock *SB : B->successors()) {
       if (!Targets.count(SB))
-        ToRemove.push_back(const_cast(SB));
+        ToRemove.push_back(SB);
       Targets.remove(SB);
     }
     for (MachineBasicBlock *MBB : ToRemove)
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 53943de3bc597..e285e04543694 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
       R = N;
       break;
     }
+    case ISD::AssertSext: {
+      EVT T = cast(N.getOperand(1))->getVT();
+      if (T.getSizeInBits() == 32)
+        R = N.getOperand(0);
+      else
+        return false;
+      break;
+    }
+
    default:
      return false;
  }
diff --git
a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index ec73e58ce5d44..facea646d4b68 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -236,7 +236,16 @@ MVT HexagonTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, SDValue HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { - return SDValue(); + unsigned IntNo = Op.getConstantOperandVal(0); + SDLoc dl(Op); + switch (IntNo) { + default: + return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(HexagonISD::THREAD_POINTER, dl, PtrVT); + } + } } /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified @@ -1588,6 +1597,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); @@ -1963,6 +1973,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VROR: return "HexagonISD::VROR"; case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE"; case HexagonISD::READTIMER: return "HexagonISD::READTIMER"; + case HexagonISD::THREAD_POINTER: + return "HexagonISD::THREAD_POINTER"; case HexagonISD::PTRUE: return "HexagonISD::PTRUE"; case HexagonISD::PFALSE: return "HexagonISD::PFALSE"; case HexagonISD::D2P: return "HexagonISD::D2P"; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index f9e5478f457f8..9ebbbc6399b42 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -31,6 +31,7 @@ namespace llvm { namespace HexagonISD { +// clang-format off enum NodeType : unsigned { OP_BEGIN = ISD::BUILTIN_OP_END, @@ -78,6 +79,7 @@ enum NodeType : unsigned { DCFETCH, READCYCLE, READTIMER, + THREAD_POINTER, PTRUE, PFALSE, D2P, // Convert 8-byte value to 8-bit predicate register. [*] @@ -121,6 +123,7 @@ enum NodeType : unsigned { }; } // end namespace HexagonISD +// clang-format on class HexagonSubtarget; diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 82d999ad820ed..4b236708ca6d5 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -3432,6 +3432,11 @@ def HexagonREADTIMER: SDNode<"HexagonISD::READTIMER", SDTInt64Leaf, def: Pat<(HexagonREADTIMER), (A4_tfrcpp UTIMER)>; +def SDTInt32Leaf : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>; +def HexagonTHREADPOINTER : SDNode<"HexagonISD::THREAD_POINTER", SDTPtrLeaf>; + +def : Pat<(HexagonTHREADPOINTER), (i32(COPY UGP))>; + // The declared return value of the store-locked intrinsics is i32, but // the instructions actually define i1. 
To avoid register copies from // IntRegs to PredRegs and back, fold the entire pattern checking the diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index de7bd5d4b2c66..7d3074ba6b5d2 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -46,16 +46,15 @@ class HexagonAsmBackend : public MCAsmBackend { MCInst * Extender; unsigned MaxPacketSize; - void ReplaceInstruction(MCCodeEmitter &E, MCRelaxableFragment &RF, - MCInst &HMB) const { + void ReplaceInstruction(MCCodeEmitter &E, MCFragment &RF, MCInst &HMB) const { SmallVector<MCFixup, 4> Fixups; SmallString<256> Code; E.encodeInstruction(HMB, Code, Fixups, *RF.getSubtargetInfo()); // Update the fragment. RF.setInst(HMB); - RF.setContents(Code); - RF.getFixups() = Fixups; + RF.setVarContents(Code); + RF.setVarFixups(Fixups); } public: @@ -200,7 +199,7 @@ class HexagonAsmBackend : public MCAsmBackend { } bool shouldForceRelocation(const MCFixup &Fixup) { - switch(Fixup.getTargetKind()) { + switch(Fixup.getKind()) { default: llvm_unreachable("Unknown Fixup Kind!") @@ -438,21 +437,21 @@ class HexagonAsmBackend : public MCAsmBackend { /// fixupNeedsRelaxation - Target specific predicate for whether a given /// fixup requires the associated instruction to be relaxed. - bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, const MCValue &, - uint64_t Value, + bool fixupNeedsRelaxationAdvanced(const MCFragment &F, const MCFixup &Fixup, + const MCValue &, uint64_t Value, bool Resolved) const override { MCInst const &MCB = RelaxedMCB; assert(HexagonMCInstrInfo::isBundle(MCB)); *RelaxTarget = nullptr; MCInst &MCI = const_cast<MCInst &>(HexagonMCInstrInfo::instruction( - MCB, Fixup.getOffset() / HEXAGON_INSTR_SIZE)); + MCB, (Fixup.getOffset() - F.getFixedSize()) / HEXAGON_INSTR_SIZE)); bool Relaxable = isInstRelaxable(MCI); if (Relaxable == false) return false; // If we cannot resolve the fixup value, it requires relaxation. if (!Resolved) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case fixup_Hexagon_B22_PCREL: // GetFixupCount assumes B22 won't relax [[fallthrough]]; @@ -595,7 +594,7 @@ class HexagonAsmBackend : public MCAsmBackend { } case MCFragment::FT_Relaxable: { MCContext &Context = getContext(); - auto &RF = cast<MCRelaxableFragment>(*Frags[K]); + auto &RF = *Frags[K]; MCInst Inst = RF.getInst(); const bool WouldTraverseLabel = llvm::any_of( diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index ed381c33225d2..9752f3a131205 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -56,7 +56,7 @@ unsigned HexagonELFObjectWriter::getRelocType(const MCFixup &Fixup, default: break; } - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: report_fatal_error("Unrecognized relocation type"); break; diff --git a/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp index 93beaec7eeff3..3c3924bd50182 100644 --- a/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp @@ -59,8 +59,7 @@ void LanaiFrameLowering::determineFrameLayout(MachineFunction &MF) const { // ADJDYNALLOC pseudo instructions with a Lanai:ADDI with the // maximum call frame size as the immediate.
void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const { - const LanaiInstrInfo &LII = - *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo()); + const LanaiInstrInfo &LII = *STI.getInstrInfo(); unsigned MaxCallFrameSize = MF.getFrameInfo().getMaxCallFrameSize(); for (MachineBasicBlock &MBB : MF) { @@ -88,8 +87,7 @@ void LanaiFrameLowering::emitPrologue(MachineFunction &MF, assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineFrameInfo &MFI = MF.getFrameInfo(); - const LanaiInstrInfo &LII = - *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo()); + const LanaiInstrInfo &LII = *STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); // Debug location must be unknown since the first debug location is used @@ -173,8 +171,7 @@ MachineBasicBlock::iterator LanaiFrameLowering::eliminateCallFramePseudoInstr( void LanaiFrameLowering::emitEpilogue(MachineFunction & /*MF*/, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - const LanaiInstrInfo &LII = - *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo()); + const LanaiInstrInfo &LII = *STI.getInstrInfo(); DebugLoc DL = MBBI->getDebugLoc(); // Restore the stack pointer using the callee's frame pointer value. @@ -195,8 +192,7 @@ void LanaiFrameLowering::determineCalleeSaves(MachineFunction &MF, TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); MachineFrameInfo &MFI = MF.getFrameInfo(); - const LanaiRegisterInfo *LRI = - static_cast<const LanaiRegisterInfo *>(STI.getRegisterInfo()); + const LanaiRegisterInfo *LRI = STI.getRegisterInfo(); int Offset = -4; // Reserve 4 bytes for the saved RCA diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d5a5f17348e4b..36c3011be2b9e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file describes the baisc single-precision floating-point instructions. +// This file describes the basic single-precision floating-point instructions. // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index ac5e7f3891c72..1493bf4cba695 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -158,7 +158,12 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized( // estimateStackSize has been observed to under-estimate the final stack // size, so give ourselves wiggle-room by checking for stack size // representable an 11-bit signed field rather than 12-bits. - if (!isInt<11>(MFI.estimateStackSize(MF))) + // For [x]vstelm.{b/h/w/d} memory instructions with an 8-bit imm offset, + // a 7-bit signed field is fine. + unsigned EstimateStackSize = MFI.estimateStackSize(MF); + if (!isInt<11>(EstimateStackSize) || + (MF.getSubtarget<LoongArchSubtarget>().hasExtLSX() && + !isInt<7>(EstimateStackSize))) ScavSlotsNum = std::max(ScavSlotsNum, 1u); // For CFR spill.
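The widened check above, as a self-contained sketch (isIntN is re-implemented locally so the snippet compiles on its own; the real code uses llvm::isInt<N> and queries the LoongArch subtarget as in the hunk):

#include <cstdint>

// A scavenging slot is needed once the estimated stack size may not fit the
// 11-bit signed offset field, or the 7-bit field used by [x]vstelm when
// LSX/LASX is enabled.
template <unsigned N> constexpr bool isIntN(int64_t X) {
  return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
}

constexpr bool needsScavengingSlot(int64_t EstimatedStackSize, bool HasExtLSX) {
  return !isIntN<11>(EstimatedStackSize) ||
         (HasExtLSX && !isIntN<7>(EstimatedStackSize));
}

static_assert(!needsScavengingSlot(63, /*HasExtLSX=*/true), "fits 7 bits");
static_assert(needsScavengingSlot(128, /*HasExtLSX=*/true), "needs slot");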
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index c47987fbf683b..2378664ca8155 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2597,12 +2597,9 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { EVT VecTy = Op->getOperand(0)->getValueType(0); SDValue Idx = Op->getOperand(1); - EVT EltTy = VecTy.getVectorElementType(); unsigned NumElts = VecTy.getVectorNumElements(); - if (isa<ConstantSDNode>(Idx) && - (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 || - EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2)) + if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts) return Op; return SDValue(); @@ -6003,10 +6000,9 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, Register ScratchReg1 = XSrc; if (Idx >= HalfSize) { ScratchReg1 = MRI.createVirtualRegister(RC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1) + BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1) .addReg(XSrc) - .addReg(XSrc) - .addImm(1); + .addImm(14); } Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 95e9fd49d1c0d..a0107e44b421b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1282,6 +1282,32 @@ multiclass PatCCXrXrF { (!cast<LAInst>(Inst#"_D") LASX256:$xj, LASX256:$xk)>; } +multiclass PairInsertExtractPatV8<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 4))), + !add(imm2, 4)), + (XVEXTRINS_W $xd, $xj, Imm)>; + } + } +} + +multiclass PairInsertExtractPatV4<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...1 in { + foreach imm2 = 0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 2))), + !add(imm2, 2)), + (XVEXTRINS_D $xd, $xj, Imm)>; + } + } +} + let Predicates = [HasExtLASX] in { // XVADD_{B/H/W/D} @@ -1582,6 +1608,38 @@ defm : PatCCXrXrF; defm : PatCCXrXrF; defm : PatCCXrXrF;
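To make the PairInsertExtractPat multiclasses above concrete, here is a scalar model of what a single XVEXTRINS_W covers for a 256-bit vector (an illustrative sketch only, not part of the patch):

// Dst/Src model v8f32 or v8i32 registers; Imm1/Imm2 are lane indices within
// one 128-bit half. XVEXTRINS_W applies the same lane move to both halves,
// so the two insert+extract pairs matched above fold into one instruction.
void pairInsertExtractV8(float Dst[8], const float Src[8], unsigned Imm1,
                         unsigned Imm2) {
  Dst[Imm2] = Src[Imm1];         // low 128-bit half
  Dst[Imm2 + 4] = Src[Imm1 + 4]; // high 128-bit half
}

+// Insert two elements extracted from a vector into a vector. (The positions +// of the two elements must be the same in the source or destination vector's +// front and back 128 bits.)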
+// 2*XVPICKVE2GR_{W/D} + 2*XVINSGR2VR_{W/D} -> XVEXTRINS_{W/D} +// XVPERMI_D + 2*XVPICKVE2GR_{B/H} + 2*PseudoXVINSGR2VR_{B/H} -> XVEXTRINS_{B/H} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert v32i8:$xd, + (GRLenVT (vector_extract v32i8:$xj, imm1)), imm2), + (GRLenVT (vector_extract v32i8:$xj, !add(imm1, 16))), + !add(imm2, 16)), + (XVEXTRINS_B $xd, $xj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert v16i16:$xd, + (GRLenVT (vector_extract v16i16:$xj, imm1)), imm2), + (GRLenVT (vector_extract v16i16:$xj, !add(imm1, 8))), + !add(imm2, 8)), + (XVEXTRINS_H $xd, $xj, Imm)>; + } +} + +defm : PairInsertExtractPatV8<v8i32, GRLenVT>; +defm : PairInsertExtractPatV8<v8f32, f32>; +defm : PairInsertExtractPatV4<v4i64, GRLenVT>; +defm : PairInsertExtractPatV4<v4f64, f64>; + // PseudoXVINSGR2VR_{B/H} def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; @@ -1593,11 +1651,18 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; - -def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm), - (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; -def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm), - (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), + (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm), + (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2), + (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>; +def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2), + (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>; +def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), + (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm), + (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; // scalar_to_vector def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)), @@ -1790,7 +1855,25 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { def : RegRegStPat<store, XVSTX, LASX256, vt>; } +// Bitcast float/double element extracted from vector to integer. +def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v8f32:$xj, uimm3:$imm))), + (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v4f64:$xj, uimm2:$imm)))), + (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm)>; + // Vector extraction with constant index.
+foreach imm = 16...31 in { + defvar Imm = !and(imm, 15); + def : Pat<(i64 (vector_extract v32i8:$xj, imm)), + (VPICKVE2GR_B (EXTRACT_SUBREG (XVPERMI_D v32i8:$xj, 14), sub_128), + Imm)>; +} +foreach imm = 8...15 in { + defvar Imm = !and(imm, 7); + def : Pat<(i64 (vector_extract v16i16:$xj, imm)), + (VPICKVE2GR_H (EXTRACT_SUBREG (XVPERMI_D v16i16:$xj, 14), sub_128), + Imm)>; +} def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)), (VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>; def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index d73d78083ddcd..962e7c21431b1 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1482,6 +1482,28 @@ multiclass VstelmPat; } +multiclass InsertExtractPatV4<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_W $vd, $vj, Imm)>; + } + } +} + +multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...1 in { + foreach imm2 = 0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_D $vd, $vj, Imm)>; + } + } +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -1782,6 +1804,31 @@ defm : PatCCVrVrF; defm : PatCCVrVrF; defm : PatCCVrVrF; +// Insert element extracted from vector into vector. +// VPICKVE2GR_{B/H/W/D} + VINSGR2VR_{B/H/W/D} -> VEXTRINS_{B/H/W/D} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v16i8:$vd, + (GRLenVT (vector_extract v16i8:$vj, imm1)), imm2), + (VEXTRINS_B $vd, $vj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v8i16:$vd, + (GRLenVT (vector_extract v8i16:$vj, imm1)), imm2), + (VEXTRINS_H $vd, $vj, Imm)>; + } +} + +defm : InsertExtractPatV4<v4i32, GRLenVT>; +defm : InsertExtractPatV4<v4f32, f32>; +defm : InsertExtractPatV2<v2i64, GRLenVT>; +defm : InsertExtractPatV2<v2f64, f64>; + // VINSGR2VR_{B/H/W/D} def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm), (VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>; @@ -1791,7 +1838,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm), (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>; def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm), (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>; - +def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$imm), + (VINSGR2VR_W $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm), + (VINSGR2VR_D $vd, $rj, uimm1:$imm)>; def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), @@ -1990,6 +2040,12 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { def : RegRegStPat<store, VSTX, LSX128, vt>; } +// Bitcast float/double element extracted from vector to integer. +def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v4f32:$vj, uimm2:$imm))), + (VPICKVE2GR_W v4f32:$vj, uimm2:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v2f64:$vj, uimm1:$imm)))), + (VPICKVE2GR_D v2f64:$vj, uimm1:$imm)>; +
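The Imm operand built by !or(!shl(imm2, 4), imm1) in these patterns is the [x]vextrins selector byte; a short sketch of the encoding, matching the TableGen arithmetic above:

// VEXTRINS/XVEXTRINS selector: destination lane index in bits [7:4],
// source lane index in bits [3:0].
constexpr unsigned vextrinsSelector(unsigned DstIdx, unsigned SrcIdx) {
  return (DstIdx << 4) | SrcIdx;
}
static_assert(vextrinsSelector(2, 1) == 0x21, "dst lane 2 <- src lane 1");

// Vector extraction with constant index.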
def : Pat<(i64 (vector_extract v16i8:$vj, uimm4:$imm)), (VPICKVE2GR_B v16i8:$vj, uimm4:$imm)>; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 1b8893029bb33..7b9f1156f9102 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -90,7 +90,7 @@ static void reportOutOfRangeError(MCContext &Ctx, SMLoc Loc, unsigned N) { static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext &Ctx) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unknown fixup kind"); case FK_Data_1: @@ -157,7 +157,7 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, MCContext &Ctx = getContext(); // Fixup leb128 separately. - if (Fixup.getTargetKind() == FK_Data_leb128) + if (Fixup.getKind() == FK_Data_leb128) return fixupLeb128(Ctx, Fixup, Data, Value); // Apply any target-specific value adjustments. @@ -247,7 +247,7 @@ bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm, bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: return STI.hasFeature(LoongArch::FeatureRelax); case FK_Data_1: @@ -279,23 +279,23 @@ getRelocPairForSize(unsigned Size) { } } -std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCLEBFragment &LF, +std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCFragment &F, int64_t &Value) const { - const MCExpr &Expr = LF.getValue(); - if (LF.isSigned() || !Expr.evaluateKnownAbsolute(Value, *Asm)) + const MCExpr &Expr = F.getLEBValue(); + if (F.isLEBSigned() || !Expr.evaluateKnownAbsolute(Value, *Asm)) return std::make_pair(false, false); - LF.addFixup(MCFixup::create(0, &Expr, FK_Data_leb128)); + F.setVarFixups({MCFixup::create(0, &Expr, FK_Data_leb128)}); return std::make_pair(true, true); } -bool LoongArchAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, +bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const { MCContext &C = getContext(); - int64_t LineDelta = DF.getLineDelta(); - const MCExpr &AddrDelta = DF.getAddrDelta(); + int64_t LineDelta = F.getDwarfLineDelta(); + const MCExpr &AddrDelta = F.getDwarfAddrDelta(); SmallVector<MCFixup, 1> Fixups; - size_t OldSize = DF.getContents().size(); + size_t OldSize = F.getVarSize(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) @@ -349,17 +349,16 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, OS << uint8_t(dwarf::DW_LNS_copy); } - DF.setContents(Data); - DF.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); WasRelaxed = OldSize != Data.size(); return true; } -bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, - bool &WasRelaxed) const { - const MCExpr &AddrDelta = DF.getAddrDelta(); +bool LoongArchAsmBackend::relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const { + const MCExpr &AddrDelta = F.getDwarfAddrDelta(); SmallVector<MCFixup, 1> Fixups; - size_t OldSize = DF.getContents().size(); + size_t OldSize = F.getVarContents().size(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) @@ -371,9 +370,9 @@ bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, assert(getContext().getAsmInfo()->getMinInstAlignment() == 1 && "expected 1-byte alignment"); if (Value == 0) { - DF.clearContents(); - DF.clearFixups(); - WasRelaxed = OldSize != 
DF.getContents().size(); + F.clearVarContents(); + F.clearVarFixups(); + WasRelaxed = OldSize != 0; return true; } @@ -405,8 +404,8 @@ bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, } else { llvm_unreachable("unsupported CFA encoding"); } - DF.setContents(Data); - DF.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); WasRelaxed = OldSize != Data.size(); return true; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 4446cadf11e22..b32ba067810ce 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -59,11 +59,9 @@ class LoongArchAsmBackend : public MCAsmBackend { MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; - bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, - bool &WasRelaxed) const override; - bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, - bool &WasRelaxed) const override; - std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, + bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override; + bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override; + std::pair<bool, bool> relaxLEB128(MCFragment &F, int64_t &Value) const override; bool writeNopData(raw_ostream &OS, uint64_t Count, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index faf3cba59a53c..fb741afa77e57 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -68,7 +68,7 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, break; } - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); if (mc::isRelocation(Fixup.getKind())) return Kind; switch (Kind) { diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp index 1fdc1f799fe52..117dd31e7f053 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp @@ -30,7 +30,7 @@ class MSP430ELFObjectWriter : public MCELFObjectTargetWriter { unsigned getRelocType(const MCFixup &Fixup, const MCValue &, bool IsPCRel) const override { // Translate fixup kind to ELF relocation type. - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case FK_Data_1: return ELF::R_MSP430_8; case FK_Data_2: return ELF::R_MSP430_16_BYTE; case FK_Data_4: return ELF::R_MSP430_32;
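For reviewers tracing the relaxLEB128 change above: once the expression is known absolute, the fragment's variable contents become plain LEB128 bytes. A self-contained sketch of unsigned LEB128 emission (illustrative only; the real code routes this through the FK_Data_leb128 fixup rather than emitting bytes directly):

#include <cstdint>
#include <vector>

// 7 data bits per byte, high bit set on every byte except the last.
std::vector<uint8_t> encodeULEB128(uint64_t Value) {
  std::vector<uint8_t> Out;
  do {
    uint8_t Byte = Value & 0x7F;
    Value >>= 7;
    if (Value)
      Byte |= 0x80;
    Out.push_back(Byte);
  } while (Value);
  return Out; // e.g. 624485 -> {0xE5, 0x8E, 0x26}
}

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 01e4d17f6236d..259b71b37d9a3 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2101,7 +2101,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, TOut.getStreamer().emitRelocDirective( *TmpExpr, inMicroMipsMode() ? 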
"R_MICROMIPS_JALR" : "R_MIPS_JALR", - RelocJalrExpr, IDLoc, *STI); + RelocJalrExpr); TOut.getStreamer().emitLabel(TmpLabel); } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt index 8b73a7bdd4bc1..8ccd42ea0abfe 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt +++ b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt @@ -8,7 +8,6 @@ add_llvm_component_library(LLVMMipsDesc MipsMCAsmInfo.cpp MipsMCCodeEmitter.cpp MipsMCTargetDesc.cpp - MipsNaClELFStreamer.cpp MipsOptionRecord.cpp MipsTargetStreamer.cpp MipsWinCOFFObjectWriter.cpp diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 25e31941bbb45..ad8f5f0a09745 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -156,7 +156,7 @@ unsigned MipsELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, bool IsPCRel) const { // Determine the type of the relocation. - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); switch (Target.getSpecifier()) { case Mips::S_DTPREL: case Mips::S_DTPREL_HI: diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h deleted file mode 100644 index 94b2f412c8cdb..0000000000000 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H -#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H - -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/Support/Alignment.h" - -namespace llvm { - -// NaCl MIPS sandbox's instruction bundle size. -static const Align MIPS_NACL_BUNDLE_ALIGN = Align(16); - -bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, - bool *IsStore = nullptr); -bool baseRegNeedsLoadStoreMask(MCRegister Reg); - -// This function creates an MCELFStreamer for Mips NaCl. 
-MCELFStreamer * -createMipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, - std::unique_ptr<MCObjectWriter> OW, - std::unique_ptr<MCCodeEmitter> Emitter); -} - -#endif diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index ab1eda0f48e15..2cc634154bffd 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -16,7 +16,6 @@ #include "MipsELFStreamer.h" #include "MipsInstPrinter.h" #include "MipsMCAsmInfo.h" -#include "MipsMCNaCl.h" #include "MipsTargetStreamer.h" #include "TargetInfo/MipsTargetInfo.h" #include "llvm/DebugInfo/CodeView/CodeView.h" @@ -199,12 +198,8 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, std::unique_ptr<MCObjectWriter> &&OW, std::unique_ptr<MCCodeEmitter> &&Emitter) { MCStreamer *S; - if (!T.isOSNaCl()) - S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW), - std::move(Emitter)); - else - S = createMipsNaClELFStreamer(Context, std::move(MAB), std::move(OW), - std::move(Emitter)); + S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW), + std::move(Emitter)); return S; } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp deleted file mode 100644 index 3410726c8e553..0000000000000 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ /dev/null @@ -1,274 +0,0 @@ -//===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements MCELFStreamer for Mips NaCl. It emits .o object files -// as required by NaCl's SFI sandbox. It inserts address-masking instructions -// before dangerous control-flow and memory access instructions. It inserts -// address-masking instructions after instructions that change the stack -// pointer. It ensures that the mask and the dangerous instruction are always -// emitted in the same bundle. It aligns call + branch delay to the bundle end, -// so that return address is always aligned to the start of next bundle. -// -//===----------------------------------------------------------------------===// - -#include "MipsELFStreamer.h" -#include "MipsMCNaCl.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/Support/ErrorHandling.h" -#include <cassert> - -using namespace llvm; - -#define DEBUG_TYPE "mips-mc-nacl" - -namespace { - -const unsigned IndirectBranchMaskReg = Mips::T6; -const unsigned LoadStoreStackMaskReg = Mips::T7; - -/// Extend the generic MCELFStreamer class so that it can mask dangerous -/// instructions. - -class MipsNaClELFStreamer : public MipsELFStreamer { -public: - MipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, - std::unique_ptr<MCObjectWriter> OW, - std::unique_ptr<MCCodeEmitter> Emitter) - : MipsELFStreamer(Context, std::move(TAB), std::move(OW), - std::move(Emitter)) {} - - ~MipsNaClELFStreamer() override = default; - -private: - // Whether we started the sandboxing sequence for calls. Calls are bundled - // with branch delays and aligned to the bundle end. 
- bool PendingCall = false; - - bool isIndirectJump(const MCInst &MI) { - if (MI.getOpcode() == Mips::JALR) { - // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead. - // JALR is an indirect branch if the link register is $0. - assert(MI.getOperand(0).isReg()); - return MI.getOperand(0).getReg() == Mips::ZERO; - } - return MI.getOpcode() == Mips::JR; - } - - bool isStackPointerFirstOperand(const MCInst &MI) { - return (MI.getNumOperands() > 0 && MI.getOperand(0).isReg() - && MI.getOperand(0).getReg() == Mips::SP); - } - - bool isCall(const MCInst &MI, bool *IsIndirectCall) { - unsigned Opcode = MI.getOpcode(); - - *IsIndirectCall = false; - - switch (Opcode) { - default: - return false; - - case Mips::JAL: - case Mips::BAL: - case Mips::BAL_BR: - case Mips::BLTZAL: - case Mips::BGEZAL: - return true; - - case Mips::JALR: - // JALR is only a call if the link register is not $0. Otherwise it's an - // indirect branch. - assert(MI.getOperand(0).isReg()); - if (MI.getOperand(0).getReg() == Mips::ZERO) - return false; - - *IsIndirectCall = true; - return true; - } - } - - void emitMask(MCRegister AddrReg, unsigned MaskReg, - const MCSubtargetInfo &STI) { - MCInst MaskInst; - MaskInst.setOpcode(Mips::AND); - MaskInst.addOperand(MCOperand::createReg(AddrReg)); - MaskInst.addOperand(MCOperand::createReg(AddrReg)); - MaskInst.addOperand(MCOperand::createReg(MaskReg)); - MipsELFStreamer::emitInstruction(MaskInst, STI); - } - - // Sandbox indirect branch or return instruction by inserting mask operation - // before it. - void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) { - MCRegister AddrReg = MI.getOperand(0).getReg(); - - emitBundleLock(false); - emitMask(AddrReg, IndirectBranchMaskReg, STI); - MipsELFStreamer::emitInstruction(MI, STI); - emitBundleUnlock(); - } - - // Sandbox memory access or SP change. Insert mask operation before and/or - // after the instruction. - void sandboxLoadStoreStackChange(const MCInst &MI, unsigned AddrIdx, - const MCSubtargetInfo &STI, bool MaskBefore, - bool MaskAfter) { - emitBundleLock(false); - if (MaskBefore) { - // Sandbox memory access. - MCRegister BaseReg = MI.getOperand(AddrIdx).getReg(); - emitMask(BaseReg, LoadStoreStackMaskReg, STI); - } - MipsELFStreamer::emitInstruction(MI, STI); - if (MaskAfter) { - // Sandbox SP change. - MCRegister SPReg = MI.getOperand(0).getReg(); - assert((Mips::SP == SPReg) && "Unexpected stack-pointer register."); - emitMask(SPReg, LoadStoreStackMaskReg, STI); - } - emitBundleUnlock(); - } - -public: - /// This function is the one used to emit instruction data into the ELF - /// streamer. We override it to mask dangerous instructions. - void emitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { - // Sandbox indirect jumps. - if (isIndirectJump(Inst)) { - if (PendingCall) - report_fatal_error("Dangerous instruction in branch delay slot!"); - sandboxIndirectJump(Inst, STI); - return; - } - - // Sandbox loads, stores and SP changes. 
- unsigned AddrIdx = 0; - bool IsStore = false; - bool IsMemAccess = isBasePlusOffsetMemoryAccess(Inst.getOpcode(), &AddrIdx, - &IsStore); - bool IsSPFirstOperand = isStackPointerFirstOperand(Inst); - if (IsMemAccess || IsSPFirstOperand) { - bool MaskBefore = (IsMemAccess - && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx) - .getReg())); - bool MaskAfter = IsSPFirstOperand && !IsStore; - if (MaskBefore || MaskAfter) { - if (PendingCall) - report_fatal_error("Dangerous instruction in branch delay slot!"); - sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter); - return; - } - // fallthrough - } - - // Sandbox calls by aligning call and branch delay to the bundle end. - // For indirect calls, emit the mask before the call. - bool IsIndirectCall; - if (isCall(Inst, &IsIndirectCall)) { - if (PendingCall) - report_fatal_error("Dangerous instruction in branch delay slot!"); - - // Start the sandboxing sequence by emitting call. - emitBundleLock(true); - if (IsIndirectCall) { - MCRegister TargetReg = Inst.getOperand(1).getReg(); - emitMask(TargetReg, IndirectBranchMaskReg, STI); - } - MipsELFStreamer::emitInstruction(Inst, STI); - PendingCall = true; - return; - } - if (PendingCall) { - // Finish the sandboxing sequence by emitting branch delay. - MipsELFStreamer::emitInstruction(Inst, STI); - emitBundleUnlock(); - PendingCall = false; - return; - } - - // None of the sandboxing applies, just emit the instruction. - MipsELFStreamer::emitInstruction(Inst, STI); - } -}; - -} // end anonymous namespace - -namespace llvm { - -bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, - bool *IsStore) { - if (IsStore) - *IsStore = false; - - switch (Opcode) { - default: - return false; - - // Load instructions with base address register in position 1. - case Mips::LB: - case Mips::LBu: - case Mips::LH: - case Mips::LHu: - case Mips::LW: - case Mips::LWC1: - case Mips::LDC1: - case Mips::LL: - case Mips::LL_R6: - case Mips::LWL: - case Mips::LWR: - *AddrIdx = 1; - return true; - - // Store instructions with base address register in position 1. - case Mips::SB: - case Mips::SH: - case Mips::SW: - case Mips::SWC1: - case Mips::SDC1: - case Mips::SWL: - case Mips::SWR: - *AddrIdx = 1; - if (IsStore) - *IsStore = true; - return true; - - // Store instructions with base address register in position 2. - case Mips::SC: - case Mips::SC_R6: - *AddrIdx = 2; - if (IsStore) - *IsStore = true; - return true; - } -} - -bool baseRegNeedsLoadStoreMask(MCRegister Reg) { - // The contents of SP and thread pointer register do not require masking. - return Reg != Mips::SP && Reg != Mips::T8; -} - -MCELFStreamer * -createMipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, - std::unique_ptr<MCObjectWriter> OW, - std::unique_ptr<MCCodeEmitter> Emitter) { - MipsNaClELFStreamer *S = new MipsNaClELFStreamer( - Context, std::move(TAB), std::move(OW), std::move(Emitter)); - - // Set bundle-alignment as required by the NaCl ABI for the target. 
- S->emitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN); - - return S; -} - -} // end namespace llvm diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index c69fc68ab5af1..b89d6890903dd 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -1033,42 +1033,42 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() { } void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_GPREL32)); DF->appendContents(4, 0); } void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_GPREL32)); DF->appendContents(8, 0); } void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_DTPREL32)); DF->appendContents(4, 0); } void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_DTPREL64)); DF->appendContents(8, 0); } void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_TPREL32)); DF->appendContents(4, 0); } void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_TPREL64)); DF->appendContents(8, 0); diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 87e06a6d3c08a..ca0331006be74 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -16,7 +16,6 @@ #include "MCTargetDesc/MipsBaseInfo.h" #include "MCTargetDesc/MipsInstPrinter.h" #include "MCTargetDesc/MipsMCAsmInfo.h" -#include "MCTargetDesc/MipsMCNaCl.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "MCTargetDesc/MipsTargetStreamer.h" #include "Mips.h" @@ -87,10 +86,6 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) { StubsNeeded.insert(I); MCP = MF.getConstantPool(); - // In NaCl, all indirect jump targets must be aligned to bundle size. - if (Subtarget->isTargetNaCl()) - NaClAlignIndirectJumpTargets(MF); - AsmPrinter::runOnMachineFunction(MF); emitXRayTable(); @@ -171,7 +166,7 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI, OutStreamer.emitRelocDirective( *OffsetExpr, Subtarget.inMicroMipsMode() ? 
"R_MICROMIPS_JALR" : "R_MIPS_JALR", - CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo()); + CaleeExpr); OutStreamer.emitLabel(OffsetLabel); return; } @@ -401,11 +396,6 @@ const char *MipsAsmPrinter::getCurrentABIString() const { void MipsAsmPrinter::emitFunctionEntryLabel() { MipsTargetStreamer &TS = getTargetStreamer(); - // NaCl sandboxing requires that indirect call instructions are masked. - // This means that function entry points should be bundle-aligned. - if (Subtarget->isTargetNaCl()) - emitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN)); - if (Subtarget->inMicroMipsMode()) { TS.emitDirectiveSetMicroMips(); TS.setUsesMicroMips(); @@ -1263,27 +1253,6 @@ void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const { AsmPrinter::emitDebugValue(Value, Size); } -// Align all targets of indirect branches on bundle size. Used only if target -// is NaCl. -void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) { - // Align all blocks that are jumped to through jump table. - if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) { - const std::vector &JT = JtInfo->getJumpTables(); - for (const auto &I : JT) { - const std::vector &MBBs = I.MBBs; - - for (MachineBasicBlock *MBB : MBBs) - MBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN); - } - } - - // If basic block address is taken, block can be target of indirect branch. - for (auto &MBB : MF) { - if (MBB.hasAddressTaken()) - MBB.setAlignment(MIPS_NACL_BUNDLE_ALIGN); - } -} - bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const { return (Opcode == Mips::LONG_BRANCH_LUi || Opcode == Mips::LONG_BRANCH_LUi2Op diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h index bbaa3b3cef9de..8b2fb32dc552d 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.h +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h @@ -112,8 +112,6 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter { void EmitFPCallStub(const char *, const Mips16HardFloatInfo::FuncSignature *); - void NaClAlignIndirectJumpTargets(MachineFunction &MF); - bool isLongBranchPseudo(int Opcode) const; public: diff --git a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp index 43b80c541d8a9..3720c936643b4 100644 --- a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp +++ b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp @@ -74,7 +74,6 @@ #include "MCTargetDesc/MipsABIInfo.h" #include "MCTargetDesc/MipsBaseInfo.h" -#include "MCTargetDesc/MipsMCNaCl.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "Mips.h" #include "MipsInstrInfo.h" @@ -518,27 +517,19 @@ void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) { BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA) .addReg(Mips::SP) .addImm(0); - if (STI->isTargetNaCl()) - // Bundle-align the target of indirect branch JR. - TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN); - // In NaCl, modifying the sp is not allowed in branch delay slot. // For MIPS32R6, we can skip using a delay slot branch. 
bool hasDelaySlot = buildProperJumpMI(BalTgtMBB, Pos, DL); - if (STI->isTargetNaCl() || !hasDelaySlot) { + if (!hasDelaySlot) { BuildMI(*BalTgtMBB, std::prev(Pos), DL, TII->get(Mips::ADDiu), Mips::SP) .addReg(Mips::SP) .addImm(8); } if (hasDelaySlot) { - if (STI->isTargetNaCl()) { - TII->insertNop(*BalTgtMBB, Pos, DL); - } else { - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) - .addReg(Mips::SP) - .addImm(8); - } + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) + .addReg(Mips::SP) + .addImm(8); BalTgtMBB->rbegin()->bundleWithPred(); } } else { @@ -899,14 +890,6 @@ bool MipsBranchExpansion::handlePossibleLongBranch() { (Br->isUnconditionalBranch() && IsPIC))) { int64_t Offset = computeOffset(&*Br); - if (STI->isTargetNaCl()) { - // The offset calculation does not include sandboxing instructions - // that will be added later in the MC layer. Since at this point we - // don't know the exact amount of code that "sandboxing" will add, we - // conservatively estimate that code will not grow more than 100%. - Offset *= 2; - } - if (ForceLongBranchFirstPass || !TII->isBranchOffsetInRange(Br->getOpcode(), Offset)) { MBBInfos[I].Offset = Offset; diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 3c60114f507b9..39e184a6303a5 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -267,15 +267,8 @@ def CC_Mips_FastCC : CallingConv<[ // Integer arguments are passed in integer registers. All scratch registers, // except for AT, V0 and T9, are available to be used as argument registers. - CCIfType<[i32], CCIfSubtargetNot<"isTargetNaCl()", - CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>>, - - // In NaCl, T6, T7 and T8 are reserved and not available as argument - // registers for fastcc. T6 contains the mask for sandboxing control flow - // (indirect jumps and calls). T7 contains the mask for sandboxing memory - // accesses (loads and stores). T8 contains the thread pointer. - CCIfType<[i32], CCIfSubtarget<"isTargetNaCl()", - CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, V1]>>>, + CCIfType<[i32], + CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>, // f32 arguments are passed in single-precision floating pointer registers. CCIfType<[f32], CCIfSubtarget<"useOddSPReg()", diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index b13394a607f6a..dfbbcbe602191 100644 --- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/MipsMCNaCl.h" #include "Mips.h" #include "MipsInstrInfo.h" #include "MipsSubtarget.h" @@ -727,18 +726,6 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin, continue; const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>(); - if (STI.isTargetNaCl()) { - // In NaCl, instructions that must be masked are forbidden in delay slots. - // We only check for loads, stores and SP changes. Calls, returns and - // branches are not checked because non-NaCl targets never put them in // delay slots. 
- unsigned AddrIdx; - if ((isBasePlusOffsetMemoryAccess(CurrI->getOpcode(), &AddrIdx) && - baseRegNeedsLoadStoreMask(CurrI->getOperand(AddrIdx).getReg())) || - CurrI->modifiesRegister(Mips::SP, STI.getRegisterInfo())) - continue; - } - bool InMicroMipsMode = STI.inMicroMipsMode(); const MipsInstrInfo *TII = STI.getInstrInfo(); unsigned Opcode = (*Slot).getOpcode(); diff --git a/llvm/lib/Target/Mips/MipsInstrFPU.td b/llvm/lib/Target/Mips/MipsInstrFPU.td index 14590ddacfcb7..4ca329d214981 100644 --- a/llvm/lib/Target/Mips/MipsInstrFPU.td +++ b/llvm/lib/Target/Mips/MipsInstrFPU.td @@ -622,15 +622,13 @@ let AdditionalPredicates = [NotInMicroMips] in { // Indexed loads and stores. // Base register + offset register addressing mode (indicated by "x" in the -// instruction mnemonic) is disallowed under NaCl. -let AdditionalPredicates = [IsNotNaCl] in { - def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>, - INSN_MIPS4_32R2_NOT_32R6_64R6; - def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>, - INSN_MIPS4_32R2_NOT_32R6_64R6; -} +// instruction mnemonic). +def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>, + INSN_MIPS4_32R2_NOT_32R6_64R6; +def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>, + INSN_MIPS4_32R2_NOT_32R6_64R6; -let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in { +let AdditionalPredicates = [NotInMicroMips] in { def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32; def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>, @@ -646,14 +644,14 @@ let DecoderNamespace="MipsFP64" in { // Load/store doubleword indexed unaligned. // FIXME: This instruction should not be defined for FGR_32. -let AdditionalPredicates = [IsNotNaCl, NotInMicroMips] in { +let AdditionalPredicates = [NotInMicroMips] in { def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32; def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>, INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32; } -let AdditionalPredicates = [IsNotNaCl, NotInMicroMips], +let AdditionalPredicates = [NotInMicroMips], DecoderNamespace="MipsFP64" in { def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64; diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index b6125b972717a..a124e84e9ca5f 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -236,7 +236,6 @@ def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">, AssemblerPredicate<(all_of (not FeatureMicroMips))>; def IsLE : Predicate<"Subtarget->isLittle()">; def IsBE : Predicate<"!Subtarget->isLittle()">; -def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def UseTCCInDIV : AssemblerPredicate<(all_of FeatureUseTCCInDIV)>; def HasEVA : Predicate<"Subtarget->hasEVA()">, AssemblerPredicate<(all_of FeatureEVA)>; diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index ae4b2377ad218..539288e8da592 100644 --- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -162,13 +162,6 @@ getReservedRegs(const MachineFunction &MF) const { for (MCPhysReg R : ReservedGPR32) Reserved.set(R); - // Reserve registers for the NaCl sandbox. - if (Subtarget.isTargetNaCl()) { - Reserved.set(Mips::T6); // Reserved for control flow mask. 
- Reserved.set(Mips::T7); // Reserved for memory access mask. - Reserved.set(Mips::T8); // Reserved for thread pointer. - } - for (MCPhysReg R : ReservedGPR64) Reserved.set(R); diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h index bb026f565512f..52f892a160c38 100644 --- a/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/llvm/lib/Target/Mips/MipsSubtarget.h @@ -355,7 +355,6 @@ class MipsSubtarget : public MipsGenSubtargetInfo { bool os16() const { return Os16; } - bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } bool isXRaySupported() const override { return true; } diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 443db4391a523..8eec91562ecfe 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -268,8 +268,8 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, llvm_unreachable("Empty Modifier"); } -void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, - raw_ostream &O, StringRef Modifier) { +void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum, + raw_ostream &O, StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int)MO.getImm(); if (Modifier == "sem") { @@ -286,22 +286,24 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, case NVPTX::Ordering::Release: O << ".release"; return; + case NVPTX::Ordering::AcquireRelease: + O << ".acq_rel"; + return; + case NVPTX::Ordering::SequentiallyConsistent: + O << ".seq_cst"; + return; case NVPTX::Ordering::Volatile: O << ".volatile"; return; case NVPTX::Ordering::RelaxedMMIO: O << ".mmio.relaxed"; return; - default: - report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" sem modifier. 
" - "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.", - OrderingToString(Ordering))); } } else if (Modifier == "scope") { auto S = NVPTX::Scope(Imm); switch (S) { case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: return; case NVPTX::Scope::System: O << ".sys"; @@ -316,9 +318,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, O << ".gpu"; return; } - report_fatal_error( - formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.", - ScopeToString(S))); + report_fatal_error(formatv( + "NVPTX AtomicCode Printer does not support \"{}\" scope modifier.", + ScopeToString(S))); } else if (Modifier == "addsp") { auto A = NVPTX::AddressSpace(Imm); switch (A) { @@ -334,7 +336,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, return; } report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" addsp modifier.", + "NVPTX AtomicCode Printer does not support \"{}\" addsp modifier.", AddressSpaceToString(A))); } else if (Modifier == "sign") { switch (Imm) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index 193c436939f66..c3ff3469150e4 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -40,8 +40,8 @@ class NVPTXInstPrinter : public MCInstPrinter { StringRef Modifier = {}); void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); - void printLdStCode(const MCInst *MI, int OpNum, raw_ostream &O, - StringRef Modifier = {}); + void printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O, + StringRef Modifier = {}); void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O, diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 15997bc3878d8..77a0e03d4075a 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -164,7 +164,6 @@ enum Ordering : OrderingUnderlyingType { (OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent, Volatile = SequentiallyConsistent + 1, RelaxedMMIO = Volatile + 1, - LASTORDERING = RelaxedMMIO }; using ScopeUnderlyingType = unsigned int; @@ -174,7 +173,8 @@ enum Scope : ScopeUnderlyingType { Cluster = 2, Device = 3, System = 4, - LASTSCOPE = System + DefaultDevice = 5, // For SM < 70: denotes PTX op implicit/default .gpu scope + LASTSCOPE = DefaultDevice }; using AddressSpaceUnderlyingType = unsigned int; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ae73d8da79f8e..65e7c56774547 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -494,7 +494,7 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { return true; } -static std::optional convertAS(unsigned AS) { +static std::optional convertAS(unsigned AS) { switch (AS) { case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::AddressSpace::Local; @@ -515,11 +515,42 @@ static std::optional convertAS(unsigned AS) { } } -static unsigned int getCodeAddrSpace(const MemSDNode *N) { +NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) { return convertAS(N->getMemOperand()->getAddrSpace()) .value_or(NVPTX::AddressSpace::Generic); } +NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { + // No "sem" orderings for SM/PTX versions which do not 
support memory ordering + if (!Subtarget->hasMemoryOrdering()) + return NVPTX::Ordering::NotAtomic; + auto Ordering = N->getMergedOrdering(); + switch (Ordering) { + case AtomicOrdering::NotAtomic: + return NVPTX::Ordering::NotAtomic; + case AtomicOrdering::Unordered: + case AtomicOrdering::Monotonic: + return NVPTX::Ordering::Relaxed; + case AtomicOrdering::Acquire: + return NVPTX::Ordering::Acquire; + case AtomicOrdering::Release: + return NVPTX::Ordering::Release; + case AtomicOrdering::AcquireRelease: + return NVPTX::Ordering::AcquireRelease; + case AtomicOrdering::SequentiallyConsistent: + return NVPTX::Ordering::SequentiallyConsistent; + } + llvm_unreachable("Invalid atomic ordering"); +} + +NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const { + // No "scope" modifier for SM/PTX versions which do not support scoped atomics + // Functionally, these atomics are at device scope + if (!Subtarget->hasAtomScope()) + return NVPTX::Scope::DefaultDevice; + return Scopes[N->getSyncScopeID()]; +} + namespace { struct OperationOrderings { @@ -532,7 +563,7 @@ struct OperationOrderings { static OperationOrderings getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { AtomicOrdering Ordering = N->getSuccessOrdering(); - auto CodeAddrSpace = getCodeAddrSpace(N); + auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N); bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); @@ -756,7 +787,7 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N, } static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget, - unsigned CodeAddrSpace) { + NVPTX::AddressSpace CodeAddrSpace) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address // space. return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global && @@ -788,6 +819,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -807,6 +839,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -826,6 +859,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -846,6 +880,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? 
NVPTX::atomic_thread_fence_seq_cst_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.", ScopeToString(S))); } @@ -1025,7 +1060,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { const MVT LoadedVT = LoadedEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1097,7 +1132,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { const MVT MemVT = MemEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1313,7 +1348,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(ST); + const auto CodeAddrSpace = getAddrSpace(ST); SDLoc DL(ST); SDValue Chain = ST->getChain(); @@ -1363,7 +1398,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { assert(StoreVT.isSimple() && "Store value is not simple"); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(ST); + const auto CodeAddrSpace = getAddrSpace(ST); if (CodeAddrSpace == NVPTX::AddressSpace::Const) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 88e5328ff69c5..b99b4ef2d3076 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -100,6 +100,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } + NVPTX::Ordering getMemOrder(const MemSDNode *N) const; + NVPTX::Scope getAtomicScope(const MemSDNode *N) const; bool SelectADDR(SDValue Addr, SDValue &Base, SDValue &Offset); SDValue getPTXCmpMode(const CondCodeSDNode &CondCode); @@ -114,6 +116,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { std::pair insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N); NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const; + +public: + static NVPTX::AddressSpace getAddrSpace(const MemSDNode *N); }; class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 14f05250ad6b8..7aa06f9079b09 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1048,9 +1048,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, MVT::v32i32, MVT::v64i32, MVT::v128i32}, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - // Enable custom lowering for the i128 bit operand with clusterlaunchcontrol - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i128, Custom); + // Enable custom lowering for the following: + // * MVT::i128 - clusterlaunchcontrol + // * MVT::i32 - prmt + // * MVT::Other - internal.addrspace.wrap + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other}, + Custom); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -2060,6 +2063,19 @@ 
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } +static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32, + {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)}); +} + +static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode); +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. @@ -2111,15 +2127,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32); R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32); } - return DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + return getPRMT(L, R, SelectionValue, DL, DAG); }; auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340); auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340); auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); + return DAG.getBitcast(VT, PRMT3210); } // Get value or the Nth operand as an APInt(32). Undef values treated as 0. @@ -2176,11 +2189,14 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32, DAG.getZExtOrTrunc(Index, DL, MVT::i32), DAG.getConstant(0x7770, DL, MVT::i32)); - SDValue PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::i32, - {DAG.getBitcast(MVT::i32, Vector), DAG.getConstant(0, DL, MVT::i32), - Selector, DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector), + DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG); + SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDNodeFlags Flags; + Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8); + Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8); + Ext->setFlags(Flags); + return Ext; } // Constant index will be matched by tablegen. 
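Note: the prmt.b32 selector semantics that getPRMT and the selector constants above (0x3340, 0x5410, 0x7770) rely on can be summarized in a short standalone sketch (plain C++, illustrative only, generic mode; the PTX ISA manual is authoritative):

  #include <cstdint>

  // prmt.b32, generic mode: nibble I of Selector produces result byte I.
  // The low 3 bits pick one of the eight bytes of {B, A}; bit 3 replaces
  // the byte with a replication of its sign bit.
  uint32_t prmt(uint32_t A, uint32_t B, uint32_t Selector) {
    uint64_t BitField = ((uint64_t)B << 32) | A; // {b7..b4} = B, {b3..b0} = A
    uint32_t Result = 0;
    for (unsigned I = 0; I != 4; ++I) {
      uint32_t Sel = (Selector >> (I * 4)) & 0xF;
      uint32_t Byte = (BitField >> ((Sel & 7) * 8)) & 0xFF;
      if (Sel & 8)
        Byte = (Byte & 0x80) ? 0xFF : 0x00; // sign-replicate mode
      Result |= Byte << (I * 8);
    }
    return Result;
  }

Under this model the EXTRACT_VECTOR_ELT selector (Index | 0x7770) places the indexed byte in lane 0 and fills lanes 1-3 from byte 7 of {B = 0, A = Vector}, i.e. with zeros, which is what licenses the no-wrap flags set on the extension above.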
@@ -2242,9 +2258,9 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } SDLoc DL(Op); - return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, - DAG.getConstant(Selector, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1), + DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG); + return DAG.getBitcast(Op.getValueType(), PRMT); } /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift @@ -2729,10 +2745,46 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, {TryCancelResponse0, TryCancelResponse1}); } +static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { + const unsigned Mode = [&]() { + switch (Op->getConstantOperandVal(0)) { + case Intrinsic::nvvm_prmt: + return NVPTX::PTXPrmtMode::NONE; + case Intrinsic::nvvm_prmt_b4e: + return NVPTX::PTXPrmtMode::B4E; + case Intrinsic::nvvm_prmt_ecl: + return NVPTX::PTXPrmtMode::ECL; + case Intrinsic::nvvm_prmt_ecr: + return NVPTX::PTXPrmtMode::ECR; + case Intrinsic::nvvm_prmt_f4e: + return NVPTX::PTXPrmtMode::F4E; + case Intrinsic::nvvm_prmt_rc16: + return NVPTX::PTXPrmtMode::RC16; + case Intrinsic::nvvm_prmt_rc8: + return NVPTX::PTXPrmtMode::RC8; + default: + llvm_unreachable("unsupported/unhandled intrinsic"); + } + }(); + SDLoc DL(Op); + SDValue A = Op->getOperand(1); + SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2) + : DAG.getConstant(0, DL, MVT::i32); + SDValue Selector = (Op->op_end() - 1)->get(); + return getPRMT(A, B, Selector, DL, DAG, Mode); +} static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { switch (Op->getConstantOperandVal(0)) { default: return Op; + case Intrinsic::nvvm_prmt: + case Intrinsic::nvvm_prmt_b4e: + case Intrinsic::nvvm_prmt_ecl: + case Intrinsic::nvvm_prmt_ecr: + case Intrinsic::nvvm_prmt_f4e: + case Intrinsic::nvvm_prmt_rc16: + case Intrinsic::nvvm_prmt_rc8: + return lowerPrmtIntrinsic(Op, DAG); case Intrinsic::nvvm_internal_addrspace_wrap: return Op.getOperand(1); case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: @@ -5775,11 +5827,10 @@ PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); auto &DAG = DCI.DAG; - auto PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT); + auto PRMT = + getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1), + (Op1Bytes << 8) | Op0Bytes, DL, DAG); + return DAG.getBitcast(VT, PRMT); } static SDValue combineADDRSPACECAST(SDNode *N, @@ -5797,47 +5848,120 @@ static SDValue combineADDRSPACECAST(SDNode *N, return SDValue(); } +// Given a constant selector value and a prmt mode, return the selector value +// normalized to the generic prmt mode. 
See the PTX ISA documentation for more
+// details:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
+static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
+  if (Mode == NVPTX::PTXPrmtMode::NONE)
+    return Selector;
+
+  const unsigned V = Selector.trunc(2).getZExtValue();
+
+  const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
+                              unsigned S3) {
+    return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
+  };
+
+  switch (Mode) {
+  case NVPTX::PTXPrmtMode::F4E:
+    return GetSelector(V, V + 1, V + 2, V + 3);
+  case NVPTX::PTXPrmtMode::B4E:
+    return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
+  case NVPTX::PTXPrmtMode::RC8:
+    return GetSelector(V, V, V, V);
+  case NVPTX::PTXPrmtMode::ECL:
+    return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
+  case NVPTX::PTXPrmtMode::ECR:
+    return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
+  case NVPTX::PTXPrmtMode::RC16: {
+    unsigned V1 = (V & 1) << 1;
+    return GetSelector(V1, V1 + 1, V1, V1 + 1);
+  }
+  default:
+    llvm_unreachable("Invalid PRMT mode");
+  }
+}
+
+static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
+  // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
+  APInt BitField = B.concat(A);
+  APInt SelectorVal = getPRMTSelector(Selector, Mode);
+  APInt Result(32, 0);
+  for (unsigned I : llvm::seq(4U)) {
+    APInt Sel = SelectorVal.extractBits(4, I * 4);
+    unsigned Idx = Sel.getLoBits(3).getZExtValue();
+    unsigned Sign = Sel.getHiBits(1).getZExtValue();
+    APInt Byte = BitField.extractBits(8, Idx * 8);
+    if (Sign)
+      Byte = Byte.ashr(8);
+    Result.insertBits(Byte, I * 8);
+  }
+  return Result;
+}
+
+static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                           CodeGenOptLevel OptLevel) {
+  if (OptLevel == CodeGenOptLevel::None)
+    return SDValue();
+
+  // Constant fold PRMT
+  if (isa<ConstantSDNode>(N->getOperand(0)) &&
+      isa<ConstantSDNode>(N->getOperand(1)) &&
+      isa<ConstantSDNode>(N->getOperand(2)))
+    return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
+                                           N->getConstantOperandAPInt(1),
+                                           N->getConstantOperandAPInt(2),
+                                           N->getConstantOperandVal(3)),
+                               SDLoc(N), N->getValueType(0));
+
+  return SDValue();
+}
+
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
   switch (N->getOpcode()) {
-  default: break;
-  case ISD::ADD:
-    return PerformADDCombine(N, DCI, OptLevel);
-  case ISD::FADD:
-    return PerformFADDCombine(N, DCI, OptLevel);
-  case ISD::MUL:
-    return PerformMULCombine(N, DCI, OptLevel);
-  case ISD::SHL:
-    return PerformSHLCombine(N, DCI, OptLevel);
-  case ISD::AND:
-    return PerformANDCombine(N, DCI);
-  case ISD::UREM:
-  case ISD::SREM:
-    return PerformREMCombine(N, DCI, OptLevel);
-  case ISD::SETCC:
-    return PerformSETCCCombine(N, DCI, STI.getSmVersion());
-  case ISD::LOAD:
-  case NVPTXISD::LoadParamV2:
-  case NVPTXISD::LoadV2:
-  case NVPTXISD::LoadV4:
-    return combineUnpackingMovIntoLoad(N, DCI);
-  case NVPTXISD::StoreParam:
-  case NVPTXISD::StoreParamV2:
-  case NVPTXISD::StoreParamV4:
-    return PerformStoreParamCombine(N, DCI);
-  case ISD::STORE:
-  case NVPTXISD::StoreV2:
-  case NVPTXISD::StoreV4:
-    return PerformStoreCombine(N, DCI);
-  case ISD::EXTRACT_VECTOR_ELT:
-    return PerformEXTRACTCombine(N, DCI);
-  case ISD::VSELECT:
-    return PerformVSELECTCombine(N, DCI);
-  case ISD::BUILD_VECTOR:
-    return PerformBUILD_VECTORCombine(N, DCI);
-  case ISD::ADDRSPACECAST:
-    return combineADDRSPACECAST(N, DCI);
+  default:
+    break;
+  case 
ISD::ADD: + return PerformADDCombine(N, DCI, OptLevel); + case ISD::ADDRSPACECAST: + return combineADDRSPACECAST(N, DCI); + case ISD::AND: + return PerformANDCombine(N, DCI); + case ISD::BUILD_VECTOR: + return PerformBUILD_VECTORCombine(N, DCI); + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACTCombine(N, DCI); + case ISD::FADD: + return PerformFADDCombine(N, DCI, OptLevel); + case ISD::LOAD: + case NVPTXISD::LoadParamV2: + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: + return combineUnpackingMovIntoLoad(N, DCI); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case NVPTXISD::PRMT: + return combinePRMT(N, DCI, OptLevel); + case ISD::SETCC: + return PerformSETCCCombine(N, DCI, STI.getSmVersion()); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::SREM: + case ISD::UREM: + return PerformREMCombine(N, DCI, OptLevel); + case NVPTXISD::StoreParam: + case NVPTXISD::StoreParamV2: + case NVPTXISD::StoreParamV4: + return PerformStoreParamCombine(N, DCI); + case ISD::STORE: + case NVPTXISD::StoreV2: + case NVPTXISD::StoreV4: + return PerformStoreCombine(N, DCI); + case ISD::VSELECT: + return PerformVSELECTCombine(N, DCI); } return SDValue(); } @@ -6315,10 +6439,12 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, // Specialize for cmpxchg // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated + SyncScope::ID SSID = cast(Inst)->getSyncScopeID(); if (isReleaseOrStronger(Ord)) - return Ord == AtomicOrdering::SequentiallyConsistent - ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) - : Builder.CreateFence(AtomicOrdering::Release); + return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent + ? Ord + : AtomicOrdering::Release, + SSID); return nullptr; } @@ -6330,15 +6456,15 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, if (!isa(Inst)) return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); + auto *CI = cast(Inst); auto CASWidth = - cast( - dyn_cast(Inst)->getCompareOperand()->getType()) - ->getBitWidth(); + cast(CI->getCompareOperand()->getType())->getBitWidth(); + SyncScope::ID SSID = CI->getSyncScopeID(); // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated if (isAcquireOrStronger(Ord) && (Ord != AtomicOrdering::SequentiallyConsistent || CASWidth < STI.getMinCmpXchgSizeInBits())) - return Builder.CreateFence(AtomicOrdering::Acquire); + return Builder.CreateFence(AtomicOrdering::Acquire, SSID); return nullptr; } @@ -6385,7 +6511,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, ConstantSDNode *Selector = dyn_cast(Op.getOperand(2)); unsigned Mode = Op.getConstantOperandVal(3); - if (Mode != NVPTX::PTXPrmtMode::NONE || !Selector) + if (!Selector) return; KnownBits AKnown = DAG.computeKnownBits(A, Depth); @@ -6394,7 +6520,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} KnownBits BitField = BKnown.concat(AKnown); - APInt SelectorVal = Selector->getAPIntValue(); + APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode); for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) { APInt Sel = SelectorVal.extractBits(4, I * 4); unsigned Idx = Sel.getLoBits(3).getZExtValue(); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index ecae03e77aa83..a5bb83dfadb84 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ 
b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1453,18 +1453,33 @@ let hasSideEffects = false in { (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32rir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins B32:$a, i32imm:$b, B32:$c), + (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; def PRMT_B32rii : BasicFlagsNVPTXInst<(outs B32:$d), (ins B32:$a, i32imm:$b, Hexu32imm:$c), (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>; - def PRMT_B32rir + def PRMT_B32irr : BasicFlagsNVPTXInst<(outs B32:$d), - (ins B32:$a, i32imm:$b, B32:$c), - (ins PrmtMode:$mode), + (ins i32imm:$a, B32:$b, B32:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, i32:$c, imm:$mode))]>; + def PRMT_B32iri + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, B32:$b, Hexu32imm:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32iir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, i32imm:$b, B32:$c), (ins PrmtMode:$mode), "prmt.b32$mode", - [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; + [(set i32:$d, (prmt imm:$a, imm:$b, i32:$c, imm:$mode))]>; } @@ -1608,8 +1623,8 @@ def ADDR : Operand { let MIOperandInfo = (ops ADDR_base, i32imm); } -def LdStCode : Operand { - let PrintMethod = "printLdStCode"; +def AtomicCode : Operand { + let PrintMethod = "printAtomicCode"; } def MmaCode : Operand { @@ -1962,7 +1977,7 @@ defm ProxyRegB64 : ProxyRegInst<"b64", B64>; class LD : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; @@ -1978,7 +1993,7 @@ class ST : NVPTXInst< (outs), (ins O:$src, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$toWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth" " \t[$addr], $src;", []>; @@ -1996,21 +2011,21 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC { def _v2 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, + AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v4 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, + AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; if support_v8 then def _v8 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), 
"ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, " @@ -2027,14 +2042,14 @@ multiclass ST_VEC { def _v2 : NVPTXInst< (outs), (ins O:$src1, O:$src2, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v4 : NVPTXInst< (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; @@ -2043,7 +2058,7 @@ multiclass ST_VEC { (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, O:$src5, O:$src6, O:$src7, O:$src8, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth " "\t[$addr], " diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 93827be5c2811..70150bdfc8d16 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -41,6 +41,46 @@ def AS_match { }]; } + +//===----------------------------------------------------------------------===// +// NVPTX Scope Constants +// These map to the Scope enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def Scope_thread : PatLeaf<(i32 0)>; // Thread = 0 +def Scope_cta : PatLeaf<(i32 1)>; // Block = 1 +def Scope_cluster : PatLeaf<(i32 2)>; // Cluster = 2 +def Scope_device : PatLeaf<(i32 3)>; // Device = 3 +def Scope_sys : PatLeaf<(i32 4)>; // System = 4 + +//===----------------------------------------------------------------------===// +// NVPTX Address Space Constants +// These map to the AddressSpace enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def AddrSpace_gen : PatLeaf<(i32 0)>; // Generic = 0 +def AddrSpace_global : PatLeaf<(i32 1)>; // Global = 1 +def AddrSpace_shared : PatLeaf<(i32 3)>; // Shared = 3 +def AddrSpace_const : PatLeaf<(i32 4)>; // Const = 4 +def AddrSpace_local : PatLeaf<(i32 5)>; // Local = 5 +def AddrSpace_shared_cluster : PatLeaf<(i32 7)>; // SharedCluster = 7 +def AddrSpace_param : PatLeaf<(i32 101)>; // Param = 101 + +//===----------------------------------------------------------------------===// +// NVPTX Ordering Constants +// These map to the Ordering enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def Ordering_not_atomic : PatLeaf<(i32 0)>; // NotAtomic = 0 +def Ordering_relaxed : PatLeaf<(i32 2)>; // Relaxed = 1 +def Ordering_acquire : PatLeaf<(i32 4)>; // Acquire = 4 +def Ordering_release : PatLeaf<(i32 5)>; // Release = 5 +def Ordering_acquire_release : PatLeaf<(i32 6)>; // AcquireRelease = 6 +def Ordering_sequentially_consistent : PatLeaf<(i32 7)>; // SequentiallyConsistent = 7 +def Ordering_volatile : PatLeaf<(i32 8)>; // Volatile = 8 +def Ordering_relaxed_mmio : PatLeaf<(i32 9)>; // RelaxedMMIO = 9 + + // A node that will be replaced with the current PTX version. 
class PTX { SDNodeXForm PTXVerXform = SDNodeXForm - : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c), - (PRMT_B32rrr $a, $b, $c, prmt_mode)>; - -class PRMT2Pat - : Pat<(prmt_intrinsic i32:$a, i32:$c), - (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>; - -def : PRMT3Pat; -def : PRMT3Pat; -def : PRMT3Pat; - -def : PRMT2Pat; -def : PRMT2Pat; -def : PRMT2Pat; -def : PRMT2Pat; - - def INT_NVVM_NANOSLEEP_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32", [(int_nvvm_nanosleep imm:$i)]>, Requires<[hasPTX<63>, hasSM<70>]>; @@ -1860,35 +1882,50 @@ multiclass F_ATOMIC_2 preds> { - defvar asm_str = "atom" # sem_str # as_str # "." # op_str; +multiclass F_ATOMIC_3 { + defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str; + let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.RC:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, - Requires; + def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ir : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.RC:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, - Requires; + def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ri : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.Imm:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, - Requires; + def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ii : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, - Requires; + def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; } + + defvar GetSem = SDNodeXForm(N)), SDLoc(N)); + }]>; + + defvar GetScope = SDNodeXForm(N)), SDLoc(N)); + }]>; + + defvar GetAddSp = SDNodeXForm(N)), SDLoc(N)); + }]>; + + def : Pat<(op:$this addr:$addr, t.Ty:$b, t.Ty:$c), + (!cast(NAME # _rr) ADDR:$addr, t.Ty:$b, t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c), + (!cast(NAME # _ir) ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)), + (!cast(NAME # _ri) ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)), + (!cast(NAME # _ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; } multiclass F_ATOMIC_2_AS preds = []> { @@ -1899,14 +1936,6 @@ multiclass F_ATOMIC_2_AS, preds>; } -multiclass F_ATOMIC_3_AS preds = []> { - defvar frag_pat = (frag node:$a, node:$b, node:$c); - defm _G : F_ATOMIC_3, preds>; - defm _S : F_ATOMIC_3, preds>; - defm _S_C : F_ATOMIC_3, !listconcat([hasClusters], preds)>; - defm _GEN : F_ATOMIC_3, preds>; -} - // atom_add defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS; defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS; @@ -1951,23 +1980,12 @@ defm INT_PTX_ATOM_XOR_64 : 
F_ATOMIC_2_AS("atomic_cmp_swap_i"#t.Size#_#order); - // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. - // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- - // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. - defm INT_PTX_ATOM_CAS_#t.Size#_#order - : F_ATOMIC_3_AS, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old - : F_ATOMIC_3_AS; - } +foreach t = [I16RT, I32RT, I64RT] in { + defvar atomic_cmp_swap_pat = !cast("atomic_cmp_swap_i"#t.Size); + defm INT_PTX_ATOM_CAS_#t.Size + : F_ATOMIC_3; } -// Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; - // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} // and converts it into the appropriate instruction. @@ -1991,19 +2009,6 @@ multiclass ATOM2N_impl; } -multiclass ATOM3N_impl Preds> { - defm "" : F_ATOMIC_3( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - preds = Preds>; -} // Constructs variants for different scopes of atomic op. multiclass ATOM2S_impl Preds> { - // No need to define ".gpu"-scoped atomics. They do the same thing - // as the regular, non-scoped atomics defined elsewhere. + +multiclass F_ATOMIC_3_INTRINSIC_PATTERN { foreach scope = ["cta", "sys"] in { - // For now we only need variants for generic space pointers. foreach space = ["gen"] in { - defm _#scope#space : ATOM3N_impl; + defvar intrinsic = !cast("int_nvvm_atomic_" # OpStr # "_" # space # "_i_" # scope); + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, t.Ty:$c)), + (!cast(InstructionName # "_rr") ADDR:$addr, t.Ty:$b, t.Ty:$c, Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c)), + (!cast(InstructionName # "_ir") ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c))), + (!cast(InstructionName # "_ri") ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c))), + (!cast(InstructionName # "_ii") ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast("Scope_" # scope), !cast("AddrSpace_" # space))>; } } } @@ -2069,9 +2081,9 @@ multiclass ATOM2_incdec_impl { // atom.cas multiclass ATOM3_cas_impl { - defm _b16 : ATOM3S_impl; - defm _b32 : ATOM3S_impl; - defm _b64 : ATOM3S_impl; + defm _b16 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b32 : F_ATOMIC_3_INTRINSIC_PATTERN; + defm _b64 : F_ATOMIC_3_INTRINSIC_PATTERN; } defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; @@ -2137,7 +2149,7 @@ def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", B32>; // during the lifetime of the kernel. 
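Stepping back to the F_ATOMIC_3 rework above: the three AtomicCode operands are printed modifiers, so the selected instruction's text comes entirely from "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str. A minimal sketch of that expansion, with modifier spellings assumed from PTX syntax (e.g. Acquire -> ".acquire", Scope::Block -> ".cta", AddressSpace::Global -> ".global"); the ld.global.nc definitions the comment above introduces continue below:

  #include <string>

  // Sketch only: how the printed atom mnemonic is assembled from the three
  // AtomicCode modifiers plus an op string such as "cas.b32".
  std::string atomName(const std::string &Sem, const std::string &Scope,
                       const std::string &AddSp, const std::string &Op) {
    return "atom" + Sem + Scope + AddSp + "." + Op;
  }
  // atomName(".acquire", ".cta", ".global", "cas.b32")
  //   == "atom.acquire.cta.global.cas.b32"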
class LDG_G - : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + : NVPTXInst<(outs regclass:$result), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>; def LD_GLOBAL_NC_i8 : LDG_G; @@ -2150,19 +2162,19 @@ def LD_GLOBAL_NC_i64 : LDG_G; // Elementized vector ldg class VLDG_G_ELE_V2 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; class VLDG_G_ELE_V4 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; class VLDG_G_ELE_V8 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 88d3eefcc521e..4eb452f398220 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -165,6 +165,8 @@ inline std::string ScopeToString(Scope S) { return "Cluster"; case Scope::Device: return "Device"; + case Scope::DefaultDevice: + return "DefaultDevice"; } report_fatal_error(formatv("Unknown NVPTX::Scope \"{}\".", static_cast(S))); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 53312e36fb9da..a5d3be40c5cfc 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -96,7 +96,7 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup, // determine the type of the relocation unsigned Type = 0; if (IsPCRel) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unimplemented"); case PPC::fixup_ppc_br24: @@ -173,8 +173,9 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup, break; } } else { - switch (Fixup.getTargetKind()) { - default: llvm_unreachable("invalid fixup kind!"); + switch (Fixup.getKind()) { + default: + llvm_unreachable("invalid fixup kind!"); case PPC::fixup_ppc_br24abs: Type = ELF::R_PPC_ADDR24; break; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp index ee99cfc7d655d..2dbc31fce72c2 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp @@ -155,11 +155,10 @@ void PPCELFStreamer::emitGOTToPCRelReloc(const MCInst &Inst) { const MCExpr *SubExpr2 = MCBinaryExpr::createSub(CurrentLocationExpr, SubExpr, getContext()); - MCDataFragment *DF = static_cast(LabelSym->getFragment()); - assert(DF && "Expecting a valid data fragment."); - MCFixupKind FixupKind = static_cast(FirstLiteralRelocationKind + - ELF::R_PPC64_PCREL_OPT); - DF->addFixup(MCFixup::create(LabelSym->getOffset() - 8, 
SubExpr2, FixupKind)); + MCFragment *F = LabelSym->getFragment(); + F->addFixup( + MCFixup::create(LabelSym->getOffset() - 8, SubExpr2, + FirstLiteralRelocationKind + ELF::R_PPC64_PCREL_OPT)); emitLabel(CurrentLocation, Inst.getLoc()); } diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 66f4aade380fa..a143d85f61ec2 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1199,6 +1199,14 @@ struct RISCVOperand final : public MCParsedAsmOperand { addExpr(Inst, getImm(), isRV64Imm()); } + void addSImm10UnsignedOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + int64_t Imm; + [[maybe_unused]] bool IsConstant = evaluateConstantImm(getImm(), Imm); + assert(IsConstant); + Inst.addOperand(MCOperand::createImm(SignExtend64<10>(Imm))); + } + void addFPImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); if (isImm()) { @@ -1650,6 +1658,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidSImm26: return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 25), (1 << 25) - 1); + // HACK: See comment before `BareSymbolQC_E_LI` in RISCVInstrInfoXqci.td. + case Match_InvalidBareSymbolQC_E_LI: + LLVM_FALLTHROUGH; + // END HACK case Match_InvalidBareSImm32: return generateImmOutOfRangeError(Operands, ErrorInfo, std::numeric_limits::min(), diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b723958a6ff28..fa7bcfa0e8132 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -774,7 +774,8 @@ static constexpr FeatureBitset XTHeadGroup = { RISCV::FeatureVendorXTHeadVdot}; static constexpr FeatureBitset XAndesGroup = { - RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVBFHCvt, + RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesBFHCvt, + RISCV::FeatureVendorXAndesVBFHCvt, RISCV::FeatureVendorXAndesVSIntLoad, RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot}; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 89a87798d71e4..f76f8b3060d2a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -76,12 +76,13 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"fixup_riscv_branch", 0, 32, 0}, {"fixup_riscv_rvc_jump", 2, 11, 0}, {"fixup_riscv_rvc_branch", 0, 16, 0}, + {"fixup_riscv_rvc_imm", 0, 16, 0}, {"fixup_riscv_call", 0, 64, 0}, {"fixup_riscv_call_plt", 0, 64, 0}, {"fixup_riscv_qc_e_branch", 0, 48, 0}, {"fixup_riscv_qc_e_32", 16, 32, 0}, - {"fixup_riscv_qc_abs20_u", 12, 20, 0}, + {"fixup_riscv_qc_abs20_u", 0, 32, 0}, {"fixup_riscv_qc_e_call_plt", 0, 48, 0}, // Andes fixups @@ -103,12 +104,13 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { return Infos[Kind - FirstTargetFixupKind]; } -bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, +bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &, + const MCFixup &Fixup, const MCValue &, uint64_t Value, bool Resolved) const { int64_t Offset = int64_t(Value); - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); // Return true if the 
symbol is unresolved.
   if (!Resolved)
@@ -134,6 +136,10 @@ bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
     // For jump instructions the immediate must be in the range
     // [-1048576, 1048574]
     return Offset > 1048574 || Offset < -1048576;
+  case RISCV::fixup_riscv_rvc_imm:
+    // This fixup can never be emitted as a relocation, so always needs to be
+    // relaxed.
+    return true;
   }
 }
 
@@ -152,6 +158,18 @@ static unsigned getRelaxedOpcode(unsigned Opcode, ArrayRef<MCOperand> Operands,
     // This only relaxes one "step" - i.e. from C.J to JAL, not from C.J to
     // QC.E.J, because we can always relax again if needed.
     return RISCV::JAL;
+  case RISCV::C_LI:
+    if (!STI.hasFeature(RISCV::FeatureVendorXqcili))
+      break;
+    // We only need this because `QC.E.LI` can be compressed into a `C.LI`. This
+    // happens because the `simm6` MCOperandPredicate accepts bare symbols, and
+    // `QC.E.LI` is the only instruction that accepts bare symbols at parse-time
+    // and compresses to `C.LI`. `C.LI` does not itself accept bare symbols at
+    // parse time.
+    //
+    // If we have a bare symbol, we need to turn this back to a `QC.E.LI`, as we
+    // have no way to emit a relocation on a `C.LI` instruction.
+    return RISCV::QC_E_LI;
   case RISCV::JAL: {
     // We can only relax JAL if we have Xqcilb
     if (!STI.hasFeature(RISCV::FeatureVendorXqcilb))
@@ -240,6 +258,23 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst,
     Res.addOperand(Inst.getOperand(1));
     break;
   }
+  case RISCV::C_LI: {
+    // This should only be hit when trying to relax a `C.LI` into a `QC.E.LI`
+    // because the `C.LI` has a bare symbol. We cannot use
+    // `RISCVRVC::uncompress` because it will use decompression patterns. The
+    // `QC.E.LI` compression pattern to `C.LI` is compression-only (because we
+    // don't want `c.li` ever printed as `qc.e.li`, which might be done if the
+    // pattern applied to decompression), but that doesn't help much because
+    // `C.LI` with a bare symbol will decompress to an `ADDI` anyway (because
+    // `simm12`'s MCOperandPredicate accepts a bare symbol and that pattern
+    // comes first), and we still cannot emit an `ADDI` with a bare symbol.
+ assert(STI.hasFeature(RISCV::FeatureVendorXqcili) && + "C.LI is only relaxable with Xqcili"); + Res.setOpcode(getRelaxedOpcode(Inst.getOpcode(), Inst.getOperands(), STI)); + Res.addOperand(Inst.getOperand(0)); + Res.addOperand(Inst.getOperand(1)); + break; + } case RISCV::BEQ: case RISCV::BNE: case RISCV::BLT: @@ -267,14 +302,14 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst, Inst = std::move(Res); } -bool RISCVAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, +bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const { MCContext &C = getContext(); - int64_t LineDelta = DF.getLineDelta(); - const MCExpr &AddrDelta = DF.getAddrDelta(); + int64_t LineDelta = F.getDwarfLineDelta(); + const MCExpr &AddrDelta = F.getDwarfAddrDelta(); SmallVector Fixups; - size_t OldSize = DF.getContents().size(); + size_t OldSize = F.getVarSize(); int64_t Value; [[maybe_unused]] bool IsAbsolute = @@ -327,17 +362,16 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, OS << uint8_t(dwarf::DW_LNS_copy); } - DF.setContents(Data); - DF.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); WasRelaxed = OldSize != Data.size(); return true; } -bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, - bool &WasRelaxed) const { - const MCExpr &AddrDelta = DF.getAddrDelta(); +bool RISCVAsmBackend::relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const { + const MCExpr &AddrDelta = F.getDwarfAddrDelta(); SmallVector Fixups; - size_t OldSize = DF.getContents().size(); + size_t OldSize = F.getVarSize(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) @@ -349,9 +383,9 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, assert(getContext().getAsmInfo()->getMinInstAlignment() == 1 && "expected 1-byte alignment"); if (Value == 0) { - DF.clearContents(); - DF.clearFixups(); - WasRelaxed = OldSize != DF.getContents().size(); + F.clearVarContents(); + F.clearVarFixups(); + WasRelaxed = OldSize != 0; return true; } @@ -382,20 +416,20 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, } else { llvm_unreachable("unsupported CFA encoding"); } - DF.setContents(Data); - DF.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); WasRelaxed = OldSize != Data.size(); return true; } -std::pair RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF, +std::pair RISCVAsmBackend::relaxLEB128(MCFragment &LF, int64_t &Value) const { - if (LF.isSigned()) + if (LF.isLEBSigned()) return std::make_pair(false, false); - const MCExpr &Expr = LF.getValue(); + const MCExpr &Expr = LF.getLEBValue(); if (ULEB128Reloc) { - LF.addFixup(MCFixup::create(0, &Expr, FK_Data_leb128)); + LF.setVarFixups({MCFixup::create(0, &Expr, FK_Data_leb128)}); } return std::make_pair(Expr.evaluateKnownAbsolute(Value, *Asm), false); } @@ -440,7 +474,7 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext &Ctx) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unknown fixup kind!"); case FK_Data_1: @@ -539,10 +573,18 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, (Bit5 << 2); return Value; } + case RISCV::fixup_riscv_rvc_imm: { + if (!isInt<6>(Value)) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + unsigned Bit5 = (Value >> 5) & 0x1; + unsigned Bit4_0 = Value & 0x1f; + Value = (Bit5 << 12) | (Bit4_0 << 2); + return Value; + } case 
RISCV::fixup_riscv_qc_e_32: { if (!isInt<32>(Value)) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); - return ((Value & 0xffffffff) << 16); + return Value & 0xffffffffu; } case RISCV::fixup_riscv_qc_abs20_u: { if (!isInt<20>(Value)) @@ -620,14 +662,13 @@ static const MCFixup *getPCRelHiFixup(const MCSpecifierExpr &Expr, const MCSymbol *AUIPCSymbol = AUIPCLoc.getAddSym(); if (!AUIPCSymbol) return nullptr; - const auto *DF = dyn_cast_or_null(AUIPCSymbol->getFragment()); - + const auto *DF = AUIPCSymbol->getFragment(); if (!DF) return nullptr; uint64_t Offset = AUIPCSymbol->getOffset(); if (DF->getContents().size() == Offset) { - DF = dyn_cast_or_null(DF->getNext()); + DF = DF->getNext(); if (!DF) return nullptr; Offset = 0; @@ -636,7 +677,7 @@ static const MCFixup *getPCRelHiFixup(const MCSpecifierExpr &Expr, for (const MCFixup &F : DF->getFixups()) { if (F.getOffset() != Offset) continue; - auto Kind = F.getTargetKind(); + auto Kind = F.getKind(); if (!mc::isRelocation(F.getKind())) { if (Kind == RISCV::fixup_riscv_pcrel_hi20) { *DFOut = DF; @@ -664,7 +705,7 @@ std::optional RISCVAsmBackend::evaluateFixup(const MCFragment &, const MCFixup *AUIPCFixup; const MCFragment *AUIPCDF; MCValue AUIPCTarget; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: // Use default handling for `Value` and `IsResolved`. return {}; @@ -703,14 +744,14 @@ std::optional RISCVAsmBackend::evaluateFixup(const MCFragment &, Value = Asm->getSymbolOffset(SA) + AUIPCTarget.getConstant(); Value -= Asm->getFragmentOffset(*AUIPCDF) + AUIPCFixup->getOffset(); - return AUIPCFixup->getTargetKind() == RISCV::fixup_riscv_pcrel_hi20 && + return AUIPCFixup->getKind() == RISCV::fixup_riscv_pcrel_hi20 && isPCRelFixupResolved(AUIPCTarget.getAddSym(), *AUIPCDF); } void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F, const MCFixup &Fixup) { StringRef VendorIdentifier; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: // No Vendor Relocation Required. 
return;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index 1f1a6f5fe31a0..8c10fbec3c8fc 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -60,7 +60,8 @@ class RISCVAsmBackend : public MCAsmBackend {
   std::unique_ptr<MCObjectTargetWriter>
   createObjectTargetWriter() const override;
 
-  bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t,
+  bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &,
+                                    const MCValue &, uint64_t,
                                     bool) const override;
 
   std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
 
@@ -72,11 +73,9 @@
   void relaxInstruction(MCInst &Inst,
                         const MCSubtargetInfo &STI) const override;
 
-  bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF,
-                          bool &WasRelaxed) const override;
-  bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF,
-                     bool &WasRelaxed) const override;
-  std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF,
+  bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override;
+  bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override;
+  std::pair<bool, bool> relaxLEB128(MCFragment &LF,
                                     int64_t &Value) const override;
 
   bool writeNopData(raw_ostream &OS, uint64_t Count,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index f41ad419db1a7..7ad5d5f3118b6 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -339,7 +339,6 @@ enum OperandType : unsigned {
   OPERAND_SIMM6,
   OPERAND_SIMM6_NONZERO,
   OPERAND_SIMM10,
-  OPERAND_SIMM10_UNSIGNED,
   OPERAND_SIMM10_LSB0000_NONZERO,
   OPERAND_SIMM11,
   OPERAND_SIMM12,
@@ -495,6 +494,17 @@ inline static bool isValidRoundingMode(unsigned Mode) {
   }
 }
 } // namespace RISCVVXRndMode
+namespace RISCVExceptFlags {
+enum ExceptionFlag {
+  NX = 0x01, // Inexact
+  UF = 0x02, // Underflow
+  OF = 0x04, // Overflow
+  DZ = 0x08, // Divide by zero
+  NV = 0x10, // Invalid operation
+  ALL = 0x1F // Mask for all accrued exception flags
+};
+}
+
 //===----------------------------------------------------------------------===//
 // Floating-point Immediates
 //
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 8ab2c56ae3178..9bf7896e1f1e2 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -48,7 +48,7 @@ RISCVELFObjectWriter::~RISCVELFObjectWriter() = default;
 unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                             const MCValue &Target,
                                             bool IsPCRel) const {
-  unsigned Kind = Fixup.getTargetKind();
+  auto Kind = Fixup.getKind();
   auto Spec = Target.getSpecifier();
   switch (Spec) {
   case ELF::R_RISCV_TPREL_HI20:
@@ -135,6 +135,9 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
     return ELF::R_RISCV_LO12_I;
   case RISCV::fixup_riscv_lo12_s:
     return ELF::R_RISCV_LO12_S;
+  case RISCV::fixup_riscv_rvc_imm:
+    reportError(Fixup.getLoc(), "No relocation for CI-type instructions");
+    return ELF::R_RISCV_NONE;
   case RISCV::fixup_riscv_qc_e_32:
     return ELF::R_RISCV_QC_E_32;
   case RISCV::fixup_riscv_qc_abs20_u:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index c1cdf511fae5b..f816561ccf3ff 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -40,12 +40,16 @@ enum Fixups {
   fixup_riscv_rvc_jump,
   // 8-bit fixup for symbol references in the compressed branch instruction
   fixup_riscv_rvc_branch,
+  // 6-bit fixup for symbol references in instructions like c.li
+  fixup_riscv_rvc_imm,
   // Fixup representing a legacy no-pic function call attached to the auipc
   // instruction in a pair composed of adjacent auipc+jalr instructions.
   fixup_riscv_call,
   // Fixup representing a function call attached to the auipc instruction in a
   // pair composed of adjacent auipc+jalr instructions.
   fixup_riscv_call_plt,
+
+  // Qualcomm specific fixups
   // 12-bit fixup for symbol references in the 48-bit Xqcibi branch immediate
   // instructions
   fixup_riscv_qc_e_branch,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 2ed7cd9f008a6..cbeabdddb9371 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -650,6 +650,8 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
     FixupKind = RISCV::fixup_riscv_rvc_jump;
   } else if (MIFrm == RISCVII::InstFormatCB) {
     FixupKind = RISCV::fixup_riscv_rvc_branch;
+  } else if (MIFrm == RISCVII::InstFormatCI) {
+    FixupKind = RISCV::fixup_riscv_rvc_imm;
   } else if (MIFrm == RISCVII::InstFormatI) {
     FixupKind = RISCV::fixup_riscv_12_i;
   } else if (MIFrm == RISCVII::InstFormatQC_EB) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index f66c2d5f99cb3..61ecfb278a7d3 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -30,6 +30,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include
 
 #define GET_INSTRINFO_MC_DESC
@@ -305,6 +306,47 @@ class RISCVMCInstrAnalysis : public MCInstrAnalysis {
     }
   }
 
+  /// Returns (PLT virtual address, GOT virtual address) pairs for PLT entries.
+  std::vector<std::pair<uint64_t, uint64_t>>
+  findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
+                 const MCSubtargetInfo &STI) const override {
+    uint32_t LoadInsnOpCode;
+    if (const Triple &T = STI.getTargetTriple(); T.isRISCV64())
+      LoadInsnOpCode = 0x3003; // ld
+    else if (T.isRISCV32())
+      LoadInsnOpCode = 0x2003; // lw
+    else
+      return {};
+
+    constexpr uint64_t FirstEntryAt = 32, EntrySize = 16;
+    if (PltContents.size() < FirstEntryAt + EntrySize)
+      return {};
+
+    std::vector<std::pair<uint64_t, uint64_t>> Results;
+    for (uint64_t EntryStart = FirstEntryAt,
+                  EntryStartEnd = PltContents.size() - EntrySize;
+         EntryStart <= EntryStartEnd; EntryStart += EntrySize) {
+      const uint32_t AuipcInsn =
+          support::endian::read32le(PltContents.data() + EntryStart);
+      const bool IsAuipc = (AuipcInsn & 0x7F) == 0x17;
+      if (!IsAuipc)
+        continue;
+
+      const uint32_t LoadInsn =
+          support::endian::read32le(PltContents.data() + EntryStart + 4);
+      const bool IsLoad = (LoadInsn & 0x707F) == LoadInsnOpCode;
+      if (!IsLoad)
+        continue;
+
+      const uint64_t GotPltSlotVA = PltSectionVA + EntryStart +
+                                    (AuipcInsn & 0xFFFFF000) +
+                                    SignExtend64<12>(LoadInsn >> 20);
+      Results.emplace_back(PltSectionVA + EntryStart, GotPltSlotVA);
+    }
+
+    return Results;
+  }
+
 private:
   static bool maybeReturnAddress(MCRegister Reg) {
     // X1 is used for normal returns, X5 for returns from outlined functions.
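A worked example may help with the address arithmetic in findPltEntries above. The entry below is hypothetical but follows the standard RV64 PLT entry shape (an auipc t3 / ld t3 pair, satisfying the opcode checks shown); the GOT slot is the entry address plus the auipc hi20 field plus the sign-extended load offset:

  #include <cstdint>

  // Local stand-in for llvm::SignExtend64<12>.
  int64_t signExtend12(uint64_t X) { return (int64_t)(X << 52) >> 52; }

  uint64_t gotSlotOfFirstEntry(uint64_t PltSectionVA) {
    // Hypothetical entry at offset 32:
    //   auipc t3, 0x2        ; 0x00002e17, (0x00002e17 & 0x7F) == 0x17
    //   ld    t3, 0x318(t3)  ; 0x318e3e03, (0x318e3e03 & 0x707F) == 0x3003
    const uint32_t Auipc = 0x00002e17, Load = 0x318e3e03;
    const uint64_t EntryVA = PltSectionVA + 32;
    return EntryVA + (Auipc & 0xFFFFF000) // hi20 contribution: 0x2000
           + signExtend12(Load >> 20);    // imm12 contribution: 0x318
  }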
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cb704f56d9ccd..f9c0b54be7a27 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1488,6 +1488,9 @@ def HasVendorXqcics def NoVendorXqcics : Predicate<"!Subtarget->hasVendorXqcics()">; +def HasVendorXqcicsOrXqcicm + : Predicate<"Subtarget->hasVendorXqcics() || Subtarget->hasVendorXqcicm()">; + def FeatureVendorXqcicsr : RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">; def HasVendorXqcicsr @@ -1599,6 +1602,14 @@ def HasVendorXAndesPerf AssemblerPredicate<(all_of FeatureVendorXAndesPerf), "'XAndesPerf' (Andes Performance Extension)">; +def FeatureVendorXAndesBFHCvt + : RISCVExtension<5, 0, "Andes Scalar BFLOAT16 Conversion Extension", + [FeatureStdExtF]>; +def HasVendorXAndesBFHCvt + : Predicate<"Subtarget->hasVendorXAndesBFHCvt()">, + AssemblerPredicate<(all_of FeatureVendorXAndesBFHCvt), + "'XAndesBFHCvt' (Andes Scalar BFLOAT16 Conversion Extension)">; + def FeatureVendorXAndesVBFHCvt : RISCVExtension<5, 0, "Andes Vector BFLOAT16 Conversion Extension", [FeatureStdExtZve32f]>; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index a796c910bd449..6c8e3da80b932 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -738,7 +738,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, MachineFunction &MF, uint64_t Offset, uint64_t RealStackSize, bool EmitCFI, bool NeedProbe, uint64_t ProbeSize, - bool DynAllocation) const { + bool DynAllocation, + MachineInstr::MIFlag Flag) const { DebugLoc DL; const RISCVRegisterInfo *RI = STI.getRegisterInfo(); const RISCVInstrInfo *TII = STI.getInstrInfo(); @@ -748,7 +749,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, // Simply allocate the stack if it's not big enough to require a probe. if (!NeedProbe || Offset <= ProbeSize) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Offset), - MachineInstr::FrameSetup, getStackAlign()); + Flag, getStackAlign()); if (EmitCFI) CFIBuilder.buildDefCFAOffset(RealStackSize); @@ -759,7 +760,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, .addReg(RISCV::X0) .addReg(SPReg) .addImm(0) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(Flag); } return; @@ -770,14 +771,13 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, uint64_t CurrentOffset = 0; while (CurrentOffset + ProbeSize <= Offset) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, - StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup, - getStackAlign()); + StackOffset::getFixed(-ProbeSize), Flag, getStackAlign()); // s[d|w] zero, 0(sp) BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? 
RISCV::SD : RISCV::SW)) .addReg(RISCV::X0) .addReg(SPReg) .addImm(0) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(Flag); CurrentOffset += ProbeSize; if (EmitCFI) @@ -787,8 +787,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, uint64_t Residual = Offset - CurrentOffset; if (Residual) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, - StackOffset::getFixed(-Residual), MachineInstr::FrameSetup, - getStackAlign()); + StackOffset::getFixed(-Residual), Flag, getStackAlign()); if (EmitCFI) CFIBuilder.buildDefCFAOffset(Offset); @@ -798,7 +797,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, .addReg(RISCV::X0) .addReg(SPReg) .addImm(0) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(Flag); } } @@ -812,8 +811,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, Register TargetReg = RISCV::X6; // SUB TargetReg, SP, RoundedSize RI->adjustReg(MBB, MBBI, DL, TargetReg, SPReg, - StackOffset::getFixed(-RoundedSize), MachineInstr::FrameSetup, - getStackAlign()); + StackOffset::getFixed(-RoundedSize), Flag, getStackAlign()); if (EmitCFI) { // Set the CFA register to TargetReg. @@ -830,14 +828,14 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, if (Residual) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual), - MachineInstr::FrameSetup, getStackAlign()); + Flag, getStackAlign()); if (DynAllocation) { // s[d|w] zero, 0(sp) BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW)) .addReg(RISCV::X0) .addReg(SPReg) .addImm(0) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(Flag); } } @@ -1034,7 +1032,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MF.getInfo()->hasDynamicAllocation(); if (StackSize != 0) allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true, - NeedProbe, ProbeSize, DynAllocation); + NeedProbe, ProbeSize, DynAllocation, + MachineInstr::FrameSetup); // Save SiFive CLIC CSRs into Stack emitSiFiveCLICPreemptibleSaves(MF, MBB, MBBI, DL); @@ -1082,7 +1081,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount, getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe, - ProbeSize, DynAllocation); + ProbeSize, DynAllocation, MachineInstr::FrameSetup); } if (RVVStackSize) { @@ -1814,7 +1813,8 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( bool DynAllocation = MF.getInfo()->hasDynamicAllocation(); allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF), - /*NeedProbe=*/true, ProbeSize, DynAllocation); + /*NeedProbe=*/true, ProbeSize, DynAllocation, + MachineInstr::NoFlags); } else { const RISCVRegisterInfo &RI = *STI.getRegisterInfo(); RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount), diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index d013755ce58a0..6af63a4885f35 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -81,7 +81,8 @@ class RISCVFrameLowering : public TargetFrameLowering { void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineFunction &MF, uint64_t Offset, uint64_t RealStackSize, bool EmitCFI, bool NeedProbe, - uint64_t ProbeSize, bool DynAllocation) const; + uint64_t ProbeSize, bool DynAllocation, + MachineInstr::MIFlag Flag) const; protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 
c97b14a254cdc..cfec46d23d65b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -689,10 +689,16 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { if (!isShiftedMask_32(C1) || isInt<12>(C1)) return false; + // INSBI will clobber the input register in N0. Bail out if we need a copy to + // preserve this value. + SDValue N0 = Node->getOperand(0); + if (!N0.hasOneUse()) + return false; + // If C1 is a shifted mask (but can't be formed as an ORI), // use a bitfield insert of -1. // Transform (or x, C1) - // -> (qc.insbi x, width, shift) + // -> (qc.insbi x, -1, width, shift) const unsigned Leading = llvm::countl_zero((uint32_t)C1); const unsigned Trailing = llvm::countr_zero((uint32_t)C1); const unsigned Width = 32 - Leading - Trailing; @@ -705,7 +711,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { SDLoc DL(Node); MVT VT = Node->getSimpleValueType(0); - SDValue Ops[] = {CurDAG->getSignedTargetConstant(-1, DL, VT), + SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT), CurDAG->getTargetConstant(Width, DL, VT), CurDAG->getTargetConstant(Trailing, DL, VT)}; SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops); @@ -2842,56 +2848,6 @@ static bool isWorthFoldingAdd(SDValue Add) { return true; } -bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, - unsigned MaxShiftAmount, - SDValue &Base, SDValue &Index, - SDValue &Scale) { - EVT VT = Addr.getSimpleValueType(); - auto UnwrapShl = [this, VT, MaxShiftAmount](SDValue N, SDValue &Index, - SDValue &Shift) { - uint64_t ShiftAmt = 0; - Index = N; - - if (N.getOpcode() == ISD::SHL && isa(N.getOperand(1))) { - // Only match shifts by a value in range [0, MaxShiftAmount]. - if (N.getConstantOperandVal(1) <= MaxShiftAmount) { - Index = N.getOperand(0); - ShiftAmt = N.getConstantOperandVal(1); - } - } - - Shift = CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), VT); - return ShiftAmt != 0; - }; - - if (Addr.getOpcode() == ISD::ADD) { - if (auto *C1 = dyn_cast(Addr.getOperand(1))) { - SDValue AddrB = Addr.getOperand(0); - if (AddrB.getOpcode() == ISD::ADD && - UnwrapShl(AddrB.getOperand(0), Index, Scale) && - !isa(AddrB.getOperand(1)) && - isInt<12>(C1->getSExtValue())) { - // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) - SDValue C1Val = - CurDAG->getTargetConstant(C1->getZExtValue(), SDLoc(Addr), VT); - Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, - AddrB.getOperand(1), C1Val), - 0); - return true; - } - } else if (UnwrapShl(Addr.getOperand(0), Index, Scale)) { - Base = Addr.getOperand(1); - return true; - } else { - UnwrapShl(Addr.getOperand(1), Index, Scale); - Base = Addr.getOperand(0); - return true; - } - } - - return false; -} - bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset) { if (SelectAddrFrameIndex(Addr, Base, Offset)) @@ -2908,7 +2864,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, if (CurDAG->isBaseWithConstantOffset(Addr)) { int64_t CVal = cast(Addr.getOperand(1))->getSExtValue(); - if (isInt<12>(CVal) && isInt<12>(CVal)) { + if (isInt<12>(CVal)) { Base = Addr.getOperand(0); if (Base.getOpcode() == RISCVISD::ADD_LO) { SDValue LoOperand = Base.getOperand(1); @@ -2942,8 +2898,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, // Handle ADD with large immediates. 
if (Addr.getOpcode() == ISD::ADD && isa<ConstantSDNode>(Addr.getOperand(1))) { int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); - assert(!(isInt<12>(CVal) && isInt<12>(CVal)) && - "simm12 not already handled?"); + assert(!isInt<12>(CVal) && "simm12 not already handled?"); // Handle immediates in the range [-4096,-2049] or [2048, 4094]. We can use // an ADDI for part of the offset and fold the rest into the load/store. @@ -2984,12 +2939,11 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, return true; } -/// Similar to SelectAddrRegImm, except that the offset restricted for -/// unsinged nine bits. +/// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9. bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset) { - if (SelectAddrFrameIndex(Addr, Base, Offset)) - return true; + // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only + // a 9-bit immediate can be folded. SDLoc DL(Addr); MVT VT = Addr.getSimpleValueType(); @@ -2999,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, if (isUInt<9>(CVal)) { Base = Addr.getOperand(0); - if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base)) - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); + // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only + // a 9-bit immediate can be folded. Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } @@ -3078,6 +3032,80 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, return true; } +bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, + unsigned MaxShiftAmount, + SDValue &Base, SDValue &Index, + SDValue &Scale) { + if (Addr.getOpcode() != ISD::ADD) + return false; + SDValue LHS = Addr.getOperand(0); + SDValue RHS = Addr.getOperand(1); + + EVT VT = Addr.getSimpleValueType(); + auto SelectShl = [this, VT, MaxShiftAmount](SDValue N, SDValue &Index, + SDValue &Shift) { + if (N.getOpcode() != ISD::SHL || !isa<ConstantSDNode>(N.getOperand(1))) + return false; + + // Only match shifts by a value in range [0, MaxShiftAmount]. + unsigned ShiftAmt = N.getConstantOperandVal(1); + if (ShiftAmt > MaxShiftAmount) + return false; + + Index = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), VT); + return true; + }; + + if (auto *C1 = dyn_cast<ConstantSDNode>(RHS)) { + // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) + if (LHS.getOpcode() == ISD::ADD && + !isa<ConstantSDNode>(LHS.getOperand(1)) && + isInt<12>(C1->getSExtValue())) { + if (SelectShl(LHS.getOperand(1), Index, Scale)) { + SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(0), C1Val), + 0); + return true; + } + + // Add is commutative so we need to check both operands. + if (SelectShl(LHS.getOperand(0), Index, Scale)) { + SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(1), C1Val), + 0); + return true; + } + } + + // Don't match add with constants. + // FIXME: Is this profitable for large constants that have 0s in the lower + // 12 bits that we can materialize with LUI? + return false; + } + + // Try to match a shift on the RHS. + if (SelectShl(RHS, Index, Scale)) { + Base = LHS; + return true; + } + + // Try to match a shift on the LHS.
+ if (SelectShl(LHS, Index, Scale)) { + Base = RHS; + return true; + } + + Base = LHS; + Index = RHS; + Scale = CurDAG->getTargetConstant(0, SDLoc(Addr), VT); + return true; +} + bool RISCVDAGToDAGISel::SelectAddrRegReg(SDValue Addr, SDValue &Base, SDValue &Offset) { if (Addr.getOpcode() != ISD::ADD) @@ -3776,21 +3804,18 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits, // Select a constant that can be represented as (sign_extend(imm5) << imm2). bool RISCVDAGToDAGISel::selectSimm5Shl2(SDValue N, SDValue &Simm5, SDValue &Shl2) { - if (auto *C = dyn_cast(N)) { - int64_t Offset = C->getSExtValue(); - unsigned Shift; - for (Shift = 0; Shift < 4; Shift++) - if (isInt<5>(Offset >> Shift) && ((Offset % (1LL << Shift)) == 0)) - break; - - // Constant cannot be encoded. - if (Shift == 4) - return false; + auto *C = dyn_cast(N); + if (!C) + return false; - EVT Ty = N->getValueType(0); - Simm5 = CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(N), Ty); - Shl2 = CurDAG->getTargetConstant(Shift, SDLoc(N), Ty); - return true; + int64_t Offset = C->getSExtValue(); + for (unsigned Shift = 0; Shift < 4; Shift++) { + if (isInt<5>(Offset >> Shift) && ((Offset % (1LL << Shift)) == 0)) { + EVT VT = N->getValueType(0); + Simm5 = CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(N), VT); + Shl2 = CurDAG->getTargetConstant(Shift, SDLoc(N), VT); + return true; + } } return false; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 359e529df46f3..4845a9c84e01f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -39,7 +39,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsRISCV.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/Support/CommandLine.h" @@ -129,7 +128,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtZfhmin()) addRegisterClass(MVT::f16, &RISCV::FPR16RegClass); - if (Subtarget.hasStdExtZfbfmin()) + if (Subtarget.hasStdExtZfbfmin() || Subtarget.hasVendorXAndesBFHCvt()) addRegisterClass(MVT::bf16, &RISCV::FPR16RegClass); if (Subtarget.hasStdExtF()) addRegisterClass(MVT::f32, &RISCV::FPR32RegClass); @@ -656,6 +655,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::GET_FPENV, XLenVT, Custom); setOperationAction(ISD::SET_FPENV, XLenVT, Custom); setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom); + setOperationAction(ISD::GET_FPMODE, XLenVT, Custom); + setOperationAction(ISD::SET_FPMODE, XLenVT, Custom); + setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom); } setOperationAction({ISD::GlobalAddress, ISD::BlockAddress, ISD::ConstantPool, @@ -8226,6 +8228,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerSET_FPENV(Op, DAG); case ISD::RESET_FPENV: return lowerRESET_FPENV(Op, DAG); + case ISD::GET_FPMODE: + return lowerGET_FPMODE(Op, DAG); + case ISD::SET_FPMODE: + return lowerSET_FPMODE(Op, DAG); + case ISD::RESET_FPMODE: + return lowerRESET_FPMODE(Op, DAG); case ISD::EH_DWARF_CFA: return lowerEH_DWARF_CFA(Op, DAG); case ISD::VP_MERGE: @@ -11969,7 +11977,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, // Store with unit-stride store and load it back with segmented load. 
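// (Roughly: for Factor = 2, the concatenated vector <a0,b0,a1,b1,...> is
// stored to the stack slot once, and the masked vlseg2 load below hands back
// the deinterleaved results <a0,a1,...> and <b0,b1,...> in one structure.)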
MVT XLenVT = Subtarget.getXLenVT(); - SDValue VL = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget).second; + auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget); SDValue Passthru = DAG.getUNDEF(ConcatVT); // Allocate a stack slot. @@ -11990,16 +11998,20 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer()); static const Intrinsic::ID VlsegIntrinsicsIds[] = { - Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, Intrinsic::riscv_vlseg4, - Intrinsic::riscv_vlseg5, Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, - Intrinsic::riscv_vlseg8}; + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask}; SDValue LoadOps[] = { Chain, DAG.getTargetConstant(VlsegIntrinsicsIds[Factor - 2], DL, XLenVT), Passthru, StackPtr, + Mask, VL, + DAG.getTargetConstant( + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT), DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()), DL, XLenVT)}; unsigned Sz = @@ -12051,7 +12063,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, } MVT XLenVT = Subtarget.getXLenVT(); - SDValue VL = DAG.getRegister(RISCV::X0, XLenVT); + auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget); // If the VT is larger than LMUL=8, we need to split and reassemble. if ((VecVT.getSizeInBits().getKnownMinValue() * Factor) > @@ -12100,10 +12112,10 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); static const Intrinsic::ID IntrIds[] = { - Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, - Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, - Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, - Intrinsic::riscv_vsseg8, + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask, }; unsigned Sz = @@ -12119,6 +12131,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, DAG.getTargetConstant(IntrIds[Factor - 2], DL, XLenVT), StoredVal, StackPtr, + Mask, VL, DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()), DL, XLenVT)}; @@ -13998,6 +14011,52 @@ SDValue RISCVTargetLowering::lowerRESET_FPENV(SDValue Op, EnvValue); } +const uint64_t ModeMask64 = ~RISCVExceptFlags::ALL; +const uint32_t ModeMask32 = ~RISCVExceptFlags::ALL; + +SDValue RISCVTargetLowering::lowerGET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other); + SDValue Result = DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo); + Chain = Result.getValue(1); + return DAG.getMergeValues({Result, Chain}, DL); +} + +SDValue RISCVTargetLowering::lowerSET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + const uint64_t ModeMaskValue = Subtarget.is64Bit() ? 
ModeMask64 : ModeMask32; + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue EnvValue = Op->getOperand(1); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT); + + EnvValue = DAG.getNode(ISD::ZERO_EXTEND, DL, XLenVT, EnvValue); + EnvValue = DAG.getNode(ISD::AND, DL, XLenVT, EnvValue, ModeMask); + Chain = DAG.getNode(RISCVISD::CLEAR_CSR, DL, MVT::Other, Chain, SysRegNo, + ModeMask); + return DAG.getNode(RISCVISD::SET_CSR, DL, MVT::Other, Chain, SysRegNo, + EnvValue); +} + +SDValue RISCVTargetLowering::lowerRESET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + const uint64_t ModeMaskValue = Subtarget.is64Bit() ? ModeMask64 : ModeMask32; + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT); + + return DAG.getNode(RISCVISD::CLEAR_CSR, DL, MVT::Other, Chain, SysRegNo, + ModeMask); +} + SDValue RISCVTargetLowering::lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 00e969056df7d..e0a8c07b4206e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -429,7 +429,7 @@ class RISCVTargetLowering : public TargetLowering { bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; @@ -437,14 +437,12 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef InterleaveValues) const override; - - bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const override; + Instruction *Store, Value *Mask, + ArrayRef InterleaveValues) const override; bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOps) const override; @@ -562,6 +560,9 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSET_FPMODE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index b6b64b57b1b3e..e23001a3a0bff 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -193,7 +193,9 @@ class RVInstCommon]>, [SDNPHasChain]>; +// Clear bits of CSR. The first operand is the address of the required CSR, +// the second is the bitmask of cleared bits. 
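+// For example, the SET_FPMODE lowering above issues CLEAR_CSR followed by
+// SET_CSR on fcsr; with the mode mask in a0 and the new mode bits in a1
+// (registers chosen only for illustration), the pseudos below expand the
+// pair to:
+//   csrrc zero, fcsr, a0   # fcsr &= ~a0, old value discarded since rd is x0
+//   csrrs zero, fcsr, a1   # fcsr |= a1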
+def riscv_clear_csr : RVSDNode<"CLEAR_CSR", + SDTypeProfile<0, 2, [SDTCisInt<0>, + SDTCisInt<1>]>, + [SDNPHasChain]>; + +// Set bits of CSR. The first operand is the address of the required CSR, +// the second is the bitmask of bits to set. +def riscv_set_csr : RVSDNode<"SET_CSR", + SDTypeProfile<0, 2, [SDTCisInt<0>, + SDTCisInt<1>]>, + [SDNPHasChain]>; + // A read of the 64-bit counter CSR on a 32-bit target (returns (Lo, Hi)). // It takes a chain operand and another two target constant operands (the // CSR numbers of the low and high parts of the counter). @@ -2038,6 +2052,42 @@ class SwapSysRegImm Regs> let Defs = Regs; } +class ClearSysReg Regs> + : Pseudo<(outs), (ins GPR:$val), + [(riscv_clear_csr (XLenVT SR.Encoding), (XLenVT GPR:$val))]>, + PseudoInstExpansion<(CSRRC X0, SR.Encoding, GPR:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class ClearSysRegImm Regs> + : Pseudo<(outs), (ins uimm5:$val), + [(riscv_clear_csr (XLenVT SR.Encoding), uimm5:$val)]>, + PseudoInstExpansion<(CSRRCI X0, SR.Encoding, uimm5:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class SetSysReg Regs> + : Pseudo<(outs), (ins GPR:$val), + [(riscv_set_csr (XLenVT SR.Encoding), (XLenVT GPR:$val))]>, + PseudoInstExpansion<(CSRRS X0, SR.Encoding, GPR:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class SetSysRegImm Regs> + : Pseudo<(outs), (ins uimm5:$val), + [(riscv_set_csr (XLenVT SR.Encoding), uimm5:$val)]>, + PseudoInstExpansion<(CSRRSI X0, SR.Encoding, uimm5:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + def ReadFRM : ReadSysReg; let hasPostISelHook = 1 in { def WriteFRM : WriteSysReg; @@ -2056,6 +2106,10 @@ let hasPostISelHook = 1 in { def ReadFCSR : ReadSysReg; def WriteFCSR : WriteSysReg; def WriteFCSRImm : WriteSysRegImm; +def ClearFCSR : ClearSysReg; +def ClearFCSRImm : ClearSysRegImm; +def SetFCSR : SetSysReg; +def SetFCSRImm : SetSysRegImm; } /// Other pseudo-instructions diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index aa9e7b5635def..aef410fb4cc6e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -20,18 +20,22 @@ def simm10 : RISCVSImmLeafOp<10>; +def SImm10UnsignedAsmOperand : SImmAsmOperand<10, "Unsigned"> { + let RenderMethod = "addSImm10UnsignedOperands"; +} + // A 10-bit signed immediate allowing range [-512, 1023] -// but will decode to [-512, 511]. +// but represented as [-512, 511]. 
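+// (E.g. 768 and -256 share the same 10-bit pattern, 0b11_0000_0000, so an
+// assembler input of 768 is accepted but round-trips as -256.)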
def simm10_unsigned : RISCVOp { - let ParserMatchClass = SImmAsmOperand<10, "Unsigned">; + let ParserMatchClass = SImm10UnsignedAsmOperand; let EncoderMethod = "getImmOpValue"; let DecoderMethod = "decodeSImmOperand<10>"; - let OperandType = "OPERAND_SIMM10_UNSIGNED"; + let OperandType = "OPERAND_SIMM10"; let MCOperandPredicate = [{ int64_t Imm; if (!MCOp.evaluateAsConstantImm(Imm)) return false; - return isInt<10>(Imm) || isUInt<10>(Imm); + return isInt<10>(Imm); }]; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index ec38201cd28c6..5220815336441 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -348,6 +348,17 @@ class NDSRVInstSDGP funct3, string opcodestr> let mayStore = 1; } +class NDSRVInstBFHCvt funct7, bits<5> rs1val, DAGOperand rdty, + DAGOperand rs2ty, string opcodestr> + : RVInstR { + let rs1 = rs1val; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let mayRaiseFPException = 1; +} + class NDSRVInstVFPMAD funct6, string opcodestr> : RVInst<(outs VR:$vd), (ins VR:$vs2, FPR32:$rs1, VMaskOp:$vm), opcodestr # "." # "vf", "$vd, $rs1, $vs2$vm", [], InstFormatR>, @@ -630,6 +641,19 @@ def NDS_LDGP : NDSRVInstLDGP<0b011, "nds.ldgp">; def NDS_SDGP : NDSRVInstSDGP<0b111, "nds.sdgp">; } // Predicates = [HasVendorXAndesPerf, IsRV64] +//===----------------------------------------------------------------------===// +// XAndesBFHCvt +//===----------------------------------------------------------------------===// + +let Predicates = [HasVendorXAndesBFHCvt] in { +def NDS_FCVT_S_BF16 : NDSRVInstBFHCvt<0b0000000, 0b00010, + FPR32, FPR16, "nds.fcvt.s.bf16">, + Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]>; +def NDS_FCVT_BF16_S : NDSRVInstBFHCvt<0b0000000, 0b00011, + FPR16, FPR32, "nds.fcvt.bf16.s">, + Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; +} + //===----------------------------------------------------------------------===// // XAndesVBFHCvt //===----------------------------------------------------------------------===// @@ -743,6 +767,13 @@ def : Sh2AddPat; def : Sh3AddPat; } // Predicates = [HasVendorXAndesPerf, IsRV64] +let Predicates = [HasVendorXAndesBFHCvt] in { +def : Pat<(fpextend (bf16 FPR16:$rs)), + (NDS_FCVT_S_BF16 (bf16 FPR16:$rs))>; +def : Pat<(bf16 (fpround FPR32:$rs)), + (NDS_FCVT_BF16_S FPR32:$rs)>; +} // Predicates = [HasVendorXAndesBFHCvt] + let Predicates = [HasVendorXAndesVBFHCvt] in { defm PseudoNDS_VFWCVT_S_BF16 : VPseudoVWCVT_S_BF16; defm PseudoNDS_VFNCVT_BF16_S : VPseudoVNCVT_BF16_S; @@ -801,13 +832,13 @@ defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dotsu", "PseudoNDS_VD4DOTSU", let Predicates = [HasShortForwardBranchOpt], hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, Constraints = "$dst = $falsev" in { def PseudoCCNDS_BFOS : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, uimmlog2xlen:$msb, uimmlog2xlen:$lsb), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; def PseudoCCNDS_BFOZ : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, uimmlog2xlen:$msb, uimmlog2xlen:$lsb), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 0f222988114b8..c7cb6e237aeac 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -845,10 +845,11 @@ let Predicates = [HasVendorXqcibi, IsRV32] in { let Predicates = [HasVendorXqcibm, IsRV32] in { let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { def QC_INSBRI : QCIRVInstRI<0b1, simm11, "qc.insbri">; - def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd), - (ins simm5:$imm5, uimm5_plus1:$width, + def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt), "qc.insbi", "$rd, $imm5, $width, $shamt"> { + let Constraints = "$rd = $rd_wb"; bits<5> imm5; bits<5> shamt; bits<5> width; @@ -1336,6 +1337,22 @@ class QCISELECTIICCPat : Pat<(select (i32 (setcc (i32 GPRNoX0:$rd), (i32 GPRNoX0:$rs1), Cond)), simm5:$simm1, simm5:$simm2), (Inst GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2)>; +class QCILICCPat + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), (XLenVT GPRNoX0:$rs2), Cond)), simm5:$simm, (XLenVT GPRNoX0:$rd)), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm)>; + +class QCILICCPatInv + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), (XLenVT GPRNoX0:$rs2), Cond)), (XLenVT GPRNoX0:$rd), simm5:$simm), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm)>; + +class QCILICCIPat + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), InTyImm:$imm, Cond)), simm5:$simm, (XLenVT GPRNoX0:$rd)), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, simm5:$simm)>; + +class QCILICCIPatInv + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), InTyImm:$imm, Cond)), (XLenVT GPRNoX0:$rd), simm5:$simm), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, simm5:$simm)>; + // Match `riscv_brcc` and lower to the appropriate XQCIBI branch instruction. class BcciPat : Pat<(riscv_brcc (i32 GPRNoX0:$rs1), InTyImm:$rs2, Cond, bb:$imm12), @@ -1498,6 +1515,36 @@ def : QCIMVCCIPat ; def : QCIMVCCIPat ; } +let Predicates = [HasVendorXqcicli, HasVendorXqcicsOrXqcicm, IsRV32] in { +def : QCILICCPat ; +def : QCILICCPat ; +def : QCILICCPat ; +def : QCILICCPat ; +def : QCILICCPat ; +def : QCILICCPat ; + +def : QCILICCIPat ; +def : QCILICCIPat ; +def : QCILICCIPat ; +def : QCILICCIPat ; +def : QCILICCIPat ; +def : QCILICCIPat ; + +def : QCILICCPatInv ; +def : QCILICCPatInv ; +def : QCILICCPatInv ; +def : QCILICCPatInv ; +def : QCILICCPatInv ; +def : QCILICCPatInv ; + +def : QCILICCIPatInv ; +def : QCILICCIPatInv ; +def : QCILICCIPatInv ; +def : QCILICCIPatInv ; +def : QCILICCIPatInv ; +def : QCILICCIPatInv ; +} + let Predicates = [HasVendorXqcics, IsRV32] in { def : Pat<(select (i32 GPRNoX0:$rd), (i32 GPRNoX0:$rs2),(i32 GPRNoX0:$rs3)), (QC_SELECTNEI GPRNoX0:$rd, (i32 0), GPRNoX0:$rs2, GPRNoX0:$rs3)>; @@ -1638,6 +1685,24 @@ def : CompressPat<(QC_E_ADDAI X2, simm10_lsb0000nonzero:$imm), (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>; def : CompressPat<(QC_E_ADDI X2, X2, simm10_lsb0000nonzero:$imm), (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>; + +def : CompressPat<(QC_E_ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), + (ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; +def : CompressPat<(QC_E_ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), + (ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; +def : CompressPat<(QC_E_ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), + (ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; +def : CompressPat<(QC_E_XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), + (XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; + +def : CompressPat<(QC_E_ADDAI GPRNoX0:$rd, simm12:$imm), + (ADDI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; +def : 
CompressPat<(QC_E_ANDAI GPRNoX0:$rd, simm12:$imm), + (ANDI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; +def : CompressPat<(QC_E_ORAI GPRNoX0:$rd, simm12:$imm), + (ORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; +def : CompressPat<(QC_E_XORAI GPRNoX0:$rd, simm12:$imm), + (XORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; } // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32] let Predicates = [HasVendorXqciac, IsRV32] in { @@ -1659,3 +1724,82 @@ def : CompressPat<(QC_E_BGEUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0 def : CompressPat<(QC_E_BLTUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12), (QC_BLTUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>; } // let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32] + +// HACKS +// ----- +// The reasons for needing the definitions below are long and quite annoying. I'm writing +// this so they are explained in-line, rather than anywhere else. +// +// Emitting an instruction to an object proceeds as: +// - Compression (in emitInstruction) +// - Emit to Binary Code + Fixups +// - Assembler Relaxation +// - Fixup evaluation/application +// - If relaxed, re-emitted to Binary + Fixups +// - Relocation generation from Fixups +// +// Unfortunately, the `QC.E.LI` -> `C.LI` compression pattern has an edge case that has +// caused crashes in the past. +// +// How the bug happens is: +// - QC.E.LI is parsed with a bare symbol, which is valid + expected, and can +// be handled by fixups/relocations. +// - Compression turns this into a `C.LI` because the `simm6` +// MCOperandPredicate accepts bare symbols. +// - Binary Code emission didn't know how to create a fixup for a CI-type +// instruction containing a bare symbol. +// +// The solution to the last bullet is that we added the `fixup_riscv_rvc_imm`, +// so that we could proceed past the last error, and then use Assembler Relaxation +// to turn the `C.LI` with a bare symbol back into a `QC.E.LI`. +// +// This is good enough for emitting objects, but doesn't work for emitting +// assembly. Emitting assembly is why we need the following Hacks. +// +// Emitting an instruction to assembly proceeds as: +// - Compression (in emitInstruction) +// - Decompression (in RISCVInstPrinter::printInst) +// - InstAliases are applied +// +// So in the case of `QC.E.LI` with a bare symbol, first it is compressed to +// `C.LI` with a bare symbol, and then it is decompressed to `ADDI` with a bare +// symbol for printing, which is printed via an alias as `li <reg>, <symbol>`. +// Both the decompression and the alias use the MCOperandPredicate from +// `simm12`, which accepts bare symbols. +// +// The problem here is that `li <reg>, <symbol>` fails to parse, because the +// parsers do not accept bare symbols, they only accept symbols with specifiers +// or immediates. +// +// Our solution is to add another alias, which will be prioritised above the +// `li` alias, but only when `qc.e.li` is available. We originally intended to +// use the `bare_symbol` Operand type, but this had no MCOperandPredicate, and +// adding one changed the error messages when parsing `qc.e.li` with a +// too-large constant. So instead, we add a new `AsmOperand` and `Operand` type, +// just for the alias, which parse just like a BareSymbol, but they +// have both an MCOperandPredicate, and the error message that corresponds to +// the existing one on `qc.e.li` for too-large immediates (which fail to parse +// as both an immediate, and a bare symbol).
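+//
+// As a concrete sketch of that round trip (register and symbol names are
+// arbitrary):
+//
+//   qc.e.li a0, foo       # as parsed: 48-bit load-immediate of a bare symbol
+//   c.li    a0, foo       # after compression (simm6 accepts bare symbols)
+//   addi    a0, zero, foo # after decompression for printing
+//   li      a0, foo       # after the `li` alias; this output no longer parses
+//
+// With the alias below, the same ADDI instead prints as `qc.e.li a0, foo`,
+// which does round-trip.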
+// +// This is fairly unpleasant, but it's the least disruptive thing we can do +// and keeps all the hacks confined to the RISC-V backend code. + +def BareSymbolQC_E_LI : AsmOperandClass { + let Name = "BareSymbolQC_E_LI"; + let PredicateMethod = "isBareSymbol"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidBareSymbolQC_E_LI"; + let ParserMethod = "parseBareSymbol"; +} + +def hack_bare_symbol_qc_e_li : Operand<XLenVT> { + let ParserMatchClass = BareSymbolQC_E_LI; + let MCOperandPredicate = [{ + return MCOp.isExpr() && MCOp.isBareSymbolRef(); + }]; +} + +let Predicates = [HasVendorXqcili, IsRV32] in { +def : InstAlias<"qc.e.li $rd, $sym", (ADDI GPR:$rd, X0, hack_bare_symbol_qc_e_li:$sym), 3>; +} // Predicates = [HasVendorXqcili, IsRV32] +// END HACKS diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td index 878b85b141578..0723b2f568a79 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td @@ -41,6 +41,7 @@ class Prefetch_ri<bits<5> optype, string opcodestr> opcodestr, "${imm12}(${rs1})"> { let Inst{11-7} = 0b00000; let rs2 = optype; + let Format = InstFormatOther; // this does not follow the normal S format. } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index a6ff22c4b391f..38cc0ce00a352 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -14,6 +14,7 @@ #include "RISCVISelLowering.h" #include "RISCVSubtarget.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -68,6 +69,39 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, Intrinsic::riscv_vlseg8_mask}; +static const Intrinsic::ID FixedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + +static const Intrinsic::ID ScalableVssegIntrIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_NUWMul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } + + return false; +} + /// Lower an interleaved load into a vlsegN intrinsic. /// /// E.g.
Lower an interleaved load (Factor = 2): @@ -81,21 +115,49 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - IRBuilder<> Builder(LI); - - const DataLayout &DL = LI->getDataLayout(); + IRBuilder<> Builder(Load); + const DataLayout &DL = Load->getDataLayout(); auto *VTy = cast(Shuffles[0]->getType()); - if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) - return false; + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + + Value *Ptr, *VL; + Align Alignment; + if (auto *LI = dyn_cast(Load)) { + assert(LI->isSimple()); + Ptr = LI->getPointerOperand(); + Alignment = LI->getAlign(); + assert(!Mask && "Unexpected mask on a load\n"); + Mask = Builder.getAllOnesMask(VTy->getElementCount()); + VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); + } else { + auto *VPLoad = cast(Load); + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + Ptr = VPLoad->getMemoryPointerParam(); + Alignment = VPLoad->getPointerAlignment().value_or( + DL.getABITypeAlign(VTy->getElementType())); + + assert(Mask && "vp.load needs a mask!"); + + Value *WideEVL = VPLoad->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, DL, Factor)) + return false; - auto *PtrTy = LI->getPointerOperandType(); - auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + } + + Type *PtrTy = Ptr->getType(); + unsigned AS = PtrTy->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + return false; // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -104,25 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); - Value *VL = Builder.getInt32(VTy->getNumElements()); - + Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); + // Note: Same VL as above, but i32 not xlen due to signature of + // vp.strided.load + VL = Builder.CreateElementCount(Builder.getInt32Ty(), + VTy->getElementCount()); CallInst *CI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); - CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); + CI->addParamAttr(0, + Attribute::getWithAlignment(CI->getContext(), Alignment)); Shuffles[0]->replaceAllUsesWith(CI); return true; }; - Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); + FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -132,18 +192,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( return true; } -static const Intrinsic::ID FixedVssegIntrIds[] = { - Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, - Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, - Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, - Intrinsic::riscv_seg8_store_mask}; - -static const Intrinsic::ID ScalableVssegIntrIds[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; - /// Lower an interleaved store into a vssegN intrinsic. /// /// E.g. 
Lower an interleaved store (Factor = 3): @@ -191,7 +239,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); - Value *VL = Builder.getInt32(VTy->getNumElements()); + Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(), + VTy->getElementCount()); CallInst *CI = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_store, @@ -223,7 +272,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, // This VL should be OK (should be executable in one vsseg instruction, // potentially under larger LMULs) because we checked that the fixed vector // type fits in isLegalInterleavedAccessType - Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); + Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount()); Ops.append({SI->getPointerOperand(), StoreMask, VL}); @@ -233,58 +282,83 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, } bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef DeinterleaveValues) const { - const unsigned Factor = DeinterleaveValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor > 8) return false; - assert(LI->isSimple()); - IRBuilder<> Builder(LI); + IRBuilder<> Builder(Load); - Value *FirstActive = - *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); - VectorType *ResVTy = cast(FirstActive->getType()); + VectorType *ResVTy = getDeinterleavedVectorType(DI); - const DataLayout &DL = LI->getDataLayout(); + const DataLayout &DL = Load->getDataLayout(); + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + + Value *Ptr, *VL; + Align Alignment; + if (auto *LI = dyn_cast(Load)) { + assert(LI->isSimple()); + Ptr = LI->getPointerOperand(); + Alignment = LI->getAlign(); + assert(!Mask && "Unexpected mask on a load\n"); + Mask = Builder.getAllOnesMask(ResVTy->getElementCount()); + VL = isa(ResVTy) + ? Builder.CreateElementCount(XLenTy, ResVTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + } else { + auto *VPLoad = cast(Load); + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + Ptr = VPLoad->getMemoryPointerParam(); + Alignment = VPLoad->getPointerAlignment().value_or( + DL.getABITypeAlign(ResVTy->getElementType())); + + assert(Mask && "vp.load needs a mask!"); + + Value *WideEVL = VPLoad->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. 
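+    // E.g. with Factor = 4, an EVL of (mul nuw i32 %n, 4) is accepted and the
+    // exact udiv below recovers %n as the segment VL; for power-of-two
+    // factors, a value with enough known trailing zero bits also passes.
+    // Anything else is rejected, since a nonzero remainder would silently
+    // drop trailing elements.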
+ if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) + return false; + + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + } - if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) + Type *PtrTy = Ptr->getType(); + unsigned AS = PtrTy->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(ResVTy, Factor, Alignment, AS, DL)) return false; Value *Return; - Type *PtrTy = LI->getPointerOperandType(); - Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); - - if (auto *FVTy = dyn_cast(ResVTy)) { - Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount()); + if (isa(ResVTy)) { Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {ResVTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); + {ResVTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); } else { - static const Intrinsic::ID IntrIds[] = { - Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, - Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5, - Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, - Intrinsic::riscv_vlseg8}; - unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType()); unsigned NumElts = ResVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( - LI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(LI->getContext()), + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), NumElts * SEW / 8), Factor); + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), ScalableVlsegIntrIds[Factor - 2], + {VecTupTy, PtrTy, Mask->getType(), VL->getType()}); - Value *VL = Constant::getAllOnesValue(XLenTy); + Value *Operands[] = { + PoisonValue::get(VecTupTy), + Ptr, + Mask, + VL, + ConstantInt::get(XLenTy, + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), + ConstantInt::get(XLenTy, Log2_64(SEW))}; - Value *Vlseg = Builder.CreateIntrinsic( - IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy}, - {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL, - ConstantInt::get(XLenTy, Log2_64(SEW))}); + CallInst *Vlseg = Builder.CreateCall(VlsegNFunc, Operands); SmallVector AggrTypes{Factor, ResVTy}; - Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes)); + Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); for (unsigned i = 0; i < Factor; ++i) { Value *VecExtract = Builder.CreateIntrinsic( Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}, @@ -293,217 +367,87 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( } } - for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { - if (!DIV) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. 
- Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast(Idx)}); - DIV->replaceAllUsesWith(NewEV); - } - + DI->replaceAllUsesWith(Return); return true; } bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef InterleaveValues) const { + Instruction *Store, Value *Mask, ArrayRef InterleaveValues) const { unsigned Factor = InterleaveValues.size(); if (Factor > 8) return false; - assert(SI->isSimple()); - IRBuilder<> Builder(SI); + IRBuilder<> Builder(Store); auto *InVTy = cast(InterleaveValues[0]->getType()); - auto *PtrTy = SI->getPointerOperandType(); - const DataLayout &DL = SI->getDataLayout(); + const DataLayout &DL = Store->getDataLayout(); + Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); + + Value *Ptr, *VL; + Align Alignment; + if (auto *SI = dyn_cast(Store)) { + assert(SI->isSimple()); + Ptr = SI->getPointerOperand(); + Alignment = SI->getAlign(); + assert(!Mask && "Unexpected mask on a store"); + Mask = Builder.getAllOnesMask(InVTy->getElementCount()); + VL = isa(InVTy) + ? Builder.CreateElementCount(XLenTy, InVTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + } else { + auto *VPStore = cast(Store); + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store && + "Unexpected intrinsic"); + Ptr = VPStore->getMemoryPointerParam(); + Alignment = VPStore->getPointerAlignment().value_or( + DL.getABITypeAlign(InVTy->getElementType())); + + assert(Mask && "vp.store needs a mask!"); + + Value *WideEVL = VPStore->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, DL, Factor)) + return false; - if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), DL)) + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + } + Type *PtrTy = Ptr->getType(); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL)) return false; - Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); - - if (auto *FVTy = dyn_cast(InVTy)) { + if (isa(InVTy)) { Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy}); - + Store->getModule(), FixedVssegIntrIds[Factor - 2], + {InVTy, PtrTy, XLenTy}); SmallVector Ops(InterleaveValues); - Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount()); - Ops.append({SI->getPointerOperand(), Mask, VL}); - + Ops.append({Ptr, Mask, VL}); Builder.CreateCall(VssegNFunc, Ops); - } else { - static const Intrinsic::ID IntrIds[] = { - Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, - Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, - Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, - Intrinsic::riscv_vsseg8}; - - unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); - unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - SI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(SI->getContext()), - NumElts * SEW / 8), - Factor); - - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy}); - - Value *VL = Constant::getAllOnesValue(XLenTy); - - Value *StoredVal = 
PoisonValue::get(VecTupTy); - for (unsigned i = 0; i < Factor; ++i) - StoredVal = Builder.CreateIntrinsic( - Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}, - {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); - - Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL, - ConstantInt::get(XLenTy, Log2_64(SEW))}); - } - - return true; -} - -static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { - assert(N); - if (N == 1) - return true; - - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. - uint64_t C; - if (match(V, m_CombineOr(m_ConstantInt(C), - m_c_Mul(m_Value(), m_ConstantInt(C)))) && - C && C % N == 0) return true; - - if (isPowerOf2_32(N)) { - KnownBits KB = llvm::computeKnownBits(V, DL); - return KB.countMinTrailingZeros() >= Log2_32(N); } + unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); + unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), + NumElts * SEW / 8), + Factor); - return false; -} - -/// Lower an interleaved vp.load into a vlsegN intrinsic. -/// -/// E.g. Lower an interleaved vp.load (Factor = 2): -/// %l = call @llvm.vp.load.nxv64i8.p0(ptr %ptr, -/// %mask, -/// i32 %wide.rvl) -/// %dl = tail call { , } -/// @llvm.vector.deinterleave2.nxv64i8( -/// %l) -/// %r0 = extractvalue { , } %dl, 0 -/// %r1 = extractvalue { , } %dl, 1 -/// -/// Into: -/// %rvl = udiv %wide.rvl, 2 -/// %sl = call { , } -/// @llvm.riscv.vlseg2.mask.nxv32i8.i64( undef, -/// undef, -/// ptr %ptr, -/// %mask, -/// i64 %rvl, -/// i64 1) -/// %r0 = extractvalue { , } %sl, 0 -/// %r1 = extractvalue { , } %sl, 1 -/// -/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be -/// removed by the caller -/// TODO: We probably can loosen the dependency on matching extractvalue when -/// dealing with factor of 2 (extractvalue is still required for most of other -/// factors though). -bool RISCVTargetLowering::lowerInterleavedVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveResults) const { - const unsigned Factor = DeinterleaveResults.size(); - assert(Mask && "Expect a valid mask"); - assert(Load->getIntrinsicID() == Intrinsic::vp_load && - "Unexpected intrinsic"); - - Value *FirstActive = *llvm::find_if(DeinterleaveResults, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast(FirstActive->getType()); - - auto &DL = Load->getModule()->getDataLayout(); - Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(VTy->getElementType())); - if (!isLegalInterleavedAccessType( - VTy, Factor, Alignment, - Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) - return false; - - IRBuilder<> Builder(Load); - - Value *WideEVL = Load->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) - return false; - - auto *PtrTy = Load->getArgOperand(0)->getType(); - auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); - - Value *Return = nullptr; - if (auto *FVTy = dyn_cast(VTy)) { - Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {FVTy, PtrTy, XLenTy}, - {Load->getArgOperand(0), Mask, EVL}); - } else { - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); - - Value *PoisonVal = PoisonValue::get(VecTupTy); - - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), ScalableVlsegIntrIds[Factor - 2], - {VecTupTy, PtrTy, Mask->getType(), EVL->getType()}); - - Value *Operands[] = { - PoisonVal, - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, - RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; - - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - - SmallVector AggrTypes{Factor, VTy}; - Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); - } - } + Value *StoredVal = PoisonValue::get(VecTupTy); + for (unsigned i = 0; i < Factor; ++i) + StoredVal = Builder.CreateIntrinsic( + Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}, + {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); - for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { - if (!DIO) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. 
- Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast(Idx)}); - DIO->replaceAllUsesWith(NewEV); - } + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), ScalableVssegIntrIds[Factor - 2], + {VecTupTy, PtrTy, Mask->getType(), VL->getType()}); + Value *Operands[] = {StoredVal, Ptr, Mask, VL, + ConstantInt::get(XLenTy, Log2_64(SEW))}; + Builder.CreateCall(VssegNFunc, Operands); return true; } @@ -557,15 +501,15 @@ bool RISCVTargetLowering::lowerInterleavedVPStore( auto *PtrTy = Store->getArgOperand(1)->getType(); auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + Value *EVL = + Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); - if (auto *FVTy = dyn_cast(VTy)) { + if (isa(VTy)) { SmallVector Operands(InterleaveOperands); Operands.append({Store->getArgOperand(1), Mask, EVL}); Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2], - {FVTy, PtrTy, XLenTy}, Operands); + {VTy, PtrTy, XLenTy}, Operands); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 05388f2d13113..3e286a754e4ee 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,6 +13,17 @@ // //===----------------------------------------------------------------------===// +class SMX60IsWorstCaseMX MxList> { + string LLMUL = LargestLMUL.r; + bit c = !eq(mx, LLMUL); +} + +class SMX60IsWorstCaseMXSEW MxList, bit isF = 0> { + string LLMUL = LargestLMUL.r; + int SSEW = SmallestSEW.r; + bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -44,6 +55,19 @@ let BufferSize = 0 in { // floating point instructions, this model assumes single issue as // increasing it reduces the gains we saw in performance def SMX60_FP : ProcResource<1>; + + // Vector pipeline + // Single issue for vector store/load instructions + def SMX60_VLS : ProcResource<1>; + + // The C908 user manual says: "Vector floating-point units support vector + // floating-point computation of different bits. In addition, vector integer + // units are added". Developer confirmed it's a separate VIEU + def SMX60_VIEU : ProcResource<1>; + + // The C908 user manual says: "The vector execution unit is developed by + // extending the floating-point unit", so let's assume single issue for now + def SMX60_VFP : ProcResource<1>; } //===----------------------------------------------------------------------===// @@ -232,9 +256,341 @@ let Latency = 4 in { def : WriteRes; } +// 6. Configuration-Setting Instructions +def : WriteRes; +def : WriteRes; +def : WriteRes; + +// 7. 
Vector Loads and Stores +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + // Unit-stride loads and stores + defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>; + + // Mask loads and stores + defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + + // Strided and indexed loads and stores + foreach eew = [8, 16, 32, 64] in { + defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + } +} + +// Segmented loads and stores +foreach mx = SchedMxList in { + foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + // Unit-stride segmented + defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Strided/indexed segmented + defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Indexed segmented + defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + } + } +} + +// Whole register move/load/store +foreach LMul = [1, 2, 4, 8] in { + def : WriteRes("WriteVLD" # LMul # "R"), [SMX60_VLS]>; + def : WriteRes("WriteVST" # LMul # "R"), [SMX60_VLS]>; + + def : WriteRes("WriteVMov" # LMul # "V"), [SMX60_VIEU]>; +} + +// 11. 
Vector Integer Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Vector Integer Division and Remainder +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +// Narrowing Shift and Clips +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : 
LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 12. Vector Fixed-Point Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 13. Vector Floating-Point Instructions +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFW in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; 
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Narrowing +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet.val in { + + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Vector Floating-Point Division and Square Root +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 14. Vector Reduction Operations +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListWRed in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFWRed in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 15. Vector Mask Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 16. 
Vector Permutation Instructions
+foreach mx = SchedMxList in {
+  defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+  defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+
+  defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+
+  defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
+def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
+def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
+
+def : WriteRes<WriteVMovFS, [SMX60_VFP]>;
+def : WriteRes<WriteVMovSF, [SMX60_VFP]>;
+
+// Gather and Compress
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+    defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+    defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxList in {
+  defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
+
+  defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
+  defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+}
+
 // Others
 def : WriteRes;
 def : WriteRes;
+def : WriteRes<WriteRdVLENB, [SMX60_IEUA]>;
 
 //===----------------------------------------------------------------------===//
 // Bypass and advance
@@ -341,10 +697,184 @@ def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;
 
+// 6. Configuration-Setting Instructions
+def : ReadAdvance<ReadVSETVLI, 0>;
+def : ReadAdvance<ReadVSETVL, 0>;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance<ReadVLDX, 0>;
+def : ReadAdvance<ReadVSTX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTEV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTM", 0>;
+def : ReadAdvance<ReadVLDSX, 0>;
+def : ReadAdvance<ReadVSTSX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTS8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS64V", 0>;
+defm "" : LMULReadAdvance<"ReadVLDUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVLDOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVST1R, 0>;
+def : ReadAdvance<ReadVST2R, 0>;
+def : ReadAdvance<ReadVST4R, 0>;
+def : ReadAdvance<ReadVST8R, 0>;
+
+// 12.
Vector Integer Arithmetic Instructions +defm : LMULReadAdvance<"ReadVIALUV", 0>; +defm : LMULReadAdvance<"ReadVIALUX", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUV", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUX", 0>; +defm : LMULReadAdvance<"ReadVExtV", 0>; +defm : LMULReadAdvance<"ReadVICALUV", 0>; +defm : LMULReadAdvance<"ReadVICALUX", 0>; +defm : LMULReadAdvance<"ReadVShiftV", 0>; +defm : LMULReadAdvance<"ReadVShiftX", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftV", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftX", 0>; +defm : LMULReadAdvance<"ReadVICmpV", 0>; +defm : LMULReadAdvance<"ReadVICmpX", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxV", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxX", 0>; +defm : LMULReadAdvance<"ReadVIMulV", 0>; +defm : LMULReadAdvance<"ReadVIMulX", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivV", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulX", 0>; +defm : LMULReadAdvance<"ReadVIMulAddV", 0>; +defm : LMULReadAdvance<"ReadVIMulAddX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>; +defm : LMULReadAdvance<"ReadVIMergeV", 0>; +defm : LMULReadAdvance<"ReadVIMergeX", 0>; +defm : LMULReadAdvance<"ReadVIMovV", 0>; +defm : LMULReadAdvance<"ReadVIMovX", 0>; + +// 13. Vector Fixed-Point Arithmetic Instructions +defm "" : LMULReadAdvance<"ReadVSALUV", 0>; +defm "" : LMULReadAdvance<"ReadVSALUX", 0>; +defm "" : LMULReadAdvance<"ReadVAALUV", 0>; +defm "" : LMULReadAdvance<"ReadVAALUX", 0>; +defm "" : LMULReadAdvance<"ReadVSMulV", 0>; +defm "" : LMULReadAdvance<"ReadVSMulX", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftV", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftX", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>; + +// 14. 
Vector Floating-Point Instructions
+defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>;
+defm "" : LMULReadAdvance<"ReadVFClassV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMovF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>;
+defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
+
+// 15. Vector Reduction Operations
+def : ReadAdvance<ReadVIRedV, 0>;
+def : ReadAdvance<ReadVIRedV0, 0>;
+def : ReadAdvance<ReadVIWRedV, 0>;
+def : ReadAdvance<ReadVIWRedV0, 0>;
+def : ReadAdvance<ReadVFRedV, 0>;
+def : ReadAdvance<ReadVFRedV0, 0>;
+def : ReadAdvance<ReadVFRedOV, 0>;
+def : ReadAdvance<ReadVFRedOV0, 0>;
+def : ReadAdvance<ReadVFWRedV, 0>;
+def : ReadAdvance<ReadVFWRedV0, 0>;
+def : ReadAdvance<ReadVFWRedOV, 0>;
+def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+// 16. Vector Mask Instructions
+defm "" : LMULReadAdvance<"ReadVMALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVMPopV", 0>;
+defm "" : LMULReadAdvance<"ReadVMFFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
+
+// 17.
Vector Permutation Instructions
+def : ReadAdvance<ReadVMovXS, 0>;
+def : ReadAdvance<ReadVMovSX_V, 0>;
+def : ReadAdvance<ReadVMovSX_X, 0>;
+def : ReadAdvance<ReadVMovFS, 0>;
+def : ReadAdvance<ReadVMovSF_V, 0>;
+def : ReadAdvance<ReadVMovSF_F, 0>;
+defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVMov1V, 0>;
+def : ReadAdvance<ReadVMov2V, 0>;
+def : ReadAdvance<ReadVMov4V, 0>;
+def : ReadAdvance<ReadVMov8V, 0>;
+
+// Others
+def : ReadAdvance<ReadVMask, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
+foreach mx = SchedMxList in {
+  def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
+  foreach sew = SchedSEWSet<mx>.val in
+    def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>;
+}
+
 //===----------------------------------------------------------------------===//
 // Unsupported extensions
 defm : UnsupportedSchedQ;
-defm : UnsupportedSchedV;
 defm : UnsupportedSchedZabha;
 defm : UnsupportedSchedZbkb;
 defm : UnsupportedSchedZbkx;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 2d9f38221d424..e656e8bb99d86 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -747,6 +747,14 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
     return TwoTimes ? MILog2SEW + 1 : MILog2SEW;
   }
 
+  // Vector Register Gather with 16-bit Index Elements Instruction
+  // Dest and source data EEW=SEW. Index vector EEW=16.
+  case RISCV::VRGATHEREI16_VV: {
+    if (MO.getOperandNo() == 2)
+      return 4;
+    return MILog2SEW;
+  }
+
   default:
     return std::nullopt;
   }
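For readers unfamiliar with the EEW bookkeeping, here is a self-contained sketch of the rule the new VRGATHEREI16_VV case above encodes; the helper name is invented for illustration, and only the "operand 2 is the index vector" convention is taken from the hunk itself:

    #include <cassert>
    #include <optional>

    // Sketch: vrgatherei16.vv reads its data operands at EEW = SEW, but its
    // index operand (operand 2 in the MI layout assumed here) always has
    // 16-bit elements, so its log2 EEW is 4 regardless of the SEW.
    static std::optional<unsigned> vrgatherEI16Log2EEW(unsigned OperandNo,
                                                       unsigned MILog2SEW) {
      if (OperandNo == 2)
        return 4; // log2(16)
      return MILog2SEW;
    }

    int main() {
      // With SEW = 32 (log2 = 5): data operands -> 5, index operand -> 4.
      assert(vrgatherEI16Log2EEW(1, 5) == 5u);
      assert(vrgatherEI16Log2EEW(2, 5) == 4u);
    }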
@@ -966,6 +974,13 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VADC_VIM:
   case RISCV::VADC_VVM:
   case RISCV::VADC_VXM:
+  case RISCV::VMADC_VIM:
+  case RISCV::VMADC_VVM:
+  case RISCV::VMADC_VXM:
+  case RISCV::VSBC_VVM:
+  case RISCV::VSBC_VXM:
+  case RISCV::VMSBC_VVM:
+  case RISCV::VMSBC_VXM:
   // Vector Widening Integer Multiply-Add Instructions
   case RISCV::VWMACCU_VV:
   case RISCV::VWMACCU_VX:
@@ -1051,6 +1066,11 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VSLIDEDOWN_VI:
   case RISCV::VSLIDE1UP_VX:
   case RISCV::VFSLIDE1UP_VF:
+  // Vector Register Gather Instructions
+  case RISCV::VRGATHER_VI:
+  case RISCV::VRGATHER_VV:
+  case RISCV::VRGATHER_VX:
+  case RISCV::VRGATHEREI16_VV:
   // Vector Single-Width Floating-Point Add/Subtract Instructions
   case RISCV::VFADD_VF:
   case RISCV::VFADD_VV:
@@ -1132,6 +1152,8 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VMFLE_VV:
   case RISCV::VMFGT_VF:
   case RISCV::VMFGE_VF:
+  // Vector Floating-Point Classify Instruction
+  case RISCV::VFCLASS_V:
   // Vector Floating-Point Merge Instruction
   case RISCV::VFMERGE_VFM:
   // Vector Floating-Point Move Instruction
@@ -1346,9 +1368,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const {
     const MachineInstr &UserMI = *UserOp.getParent();
     LLVM_DEBUG(dbgs() << "  Checking user: " << UserMI << "\n");
 
-    if (UserMI.isCopy() && UserMI.getOperand(0).getReg().isVirtual() &&
-        UserMI.getOperand(0).getSubReg() == RISCV::NoSubRegister &&
-        UserMI.getOperand(1).getSubReg() == RISCV::NoSubRegister) {
+    if (UserMI.isFullCopy() && UserMI.getOperand(0).getReg().isVirtual()) {
       LLVM_DEBUG(dbgs() << "    Peeking through uses of COPY\n");
       Worklist.insert_range(llvm::make_pointer_range(
           MRI->use_operands(UserMI.getOperand(0).getReg())));
diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
index be54a8c95a978..3bd2705f021a6 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp
@@ -10,6 +10,10 @@
 // instructions and masked instructions, so that we can reduce the live range
 // overlaps of mask registers.
 //
+// If there are multiple mask producers followed by multiple masked
+// instructions, then at each masked instruction we add dependency edges
+// between every pending producer and that masked instruction.
+//
 // The reason why we need to do this:
 // 1. When tracking register pressure, we don't track physical registers.
 // 2. We have a RegisterClass for mask register (which is `VMV0`), but we don't
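To make the edge-adding scheme concrete, here is a minimal toy model of the mutation in plain C++; `ToySU` and `constrainMaskProducers` are invented names, and the edge direction follows my reading of the addEdge(Def, SDep(&SU, ...)) call below, where the masked instruction becomes a predecessor of each still-pending mask producer:

    #include <cassert>
    #include <deque>
    #include <vector>

    // Toy stand-in for an SUnit: just a list of predecessor edges.
    struct ToySU {
      std::vector<ToySU *> Preds;
    };

    // When a masked instruction is reached and several mask producers are
    // pending, make the masked instruction an (artificial) predecessor of
    // each pending producer, so no later mask producer is scheduled above
    // this masked use. The oldest pending producer, which this use
    // consumed, is then retired.
    static void constrainMaskProducers(std::deque<ToySU *> &PendingDefs,
                                       ToySU &MaskedUse) {
      if (PendingDefs.size() > 1)
        for (ToySU *Def : PendingDefs)
          Def->Preds.push_back(&MaskedUse);
      if (!PendingDefs.empty())
        PendingDefs.pop_front();
    }

    int main() {
      ToySU P1, P2, M1;
      std::deque<ToySU *> Pending{&P1, &P2};
      constrainMaskProducers(Pending, M1);
      assert(P1.Preds.size() == 1 && P2.Preds.size() == 1);
      assert(Pending.size() == 1 && Pending.front() == &P2);
    }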
@@ -40,9 +44,8 @@ namespace llvm {
 
 static bool isCopyToV0(const MachineInstr &MI) {
-  return MI.isCopy() && MI.getOperand(0).getReg() == RISCV::V0 &&
-         MI.getOperand(1).getReg().isVirtual() &&
-         MI.getOperand(1).getSubReg() == RISCV::NoSubRegister;
+  return MI.isFullCopy() && MI.getOperand(0).getReg() == RISCV::V0 &&
+         MI.getOperand(1).getReg().isVirtual();
 }
 
 static bool isSoleUseCopyToV0(SUnit &SU) {
@@ -68,11 +71,27 @@ class RISCVVectorMaskDAGMutation : public ScheduleDAGMutation {
 
   void apply(ScheduleDAGInstrs *DAG) override {
     SUnit *NearestUseV0SU = nullptr;
+    SmallVector<SUnit *> DefMask;
 
     for (SUnit &SU : DAG->SUnits) {
       const MachineInstr *MI = SU.getInstr();
-      if (MI->findRegisterUseOperand(RISCV::V0, TRI))
+      bool UseV0 = MI->findRegisterUseOperand(RISCV::V0, TRI);
+      if (isSoleUseCopyToV0(SU) && !UseV0)
+        DefMask.push_back(&SU);
+
+      if (UseV0) {
         NearestUseV0SU = &SU;
 
+        // Copy may not be a real use, so skip it here.
+        if (DefMask.size() > 1 && !MI->isCopy()) {
+          for (SUnit *Def : DefMask)
+            if (DAG->canAddEdge(Def, &SU))
+              DAG->addEdge(Def, SDep(&SU, SDep::Artificial));
+        }
+
+        if (!DefMask.empty())
+          DefMask.erase(DefMask.begin());
+      }
+
       if (NearestUseV0SU && NearestUseV0SU != &SU && isSoleUseCopyToV0(SU) &&
           // For LMUL=8 cases, there will be more possibilities to spill.
           // FIXME: We should use RegPressureTracker to do fine-grained
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 2a424e673ddf6..a7f6fbceffc3f 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -19,7 +19,6 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 6897865eb4e15..ea78dcd135267 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -1364,7 +1364,24 @@ defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, SubgroupGtMask>;
 defm : DemangledGetBuiltin<"get_sub_group_le_mask", OpenCL_std, Variable, SubgroupLeMask>;
 defm : DemangledGetBuiltin<"get_sub_group_lt_mask", OpenCL_std, Variable, SubgroupLtMask>;
 defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalLinearId", OpenCL_std, Variable, GlobalLinearId>;
-defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, Variable, GlobalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationIndex", OpenCL_std, Variable, LocalInvocationIndex>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInWorkDim", OpenCL_std, Variable, WorkDim>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupSize", OpenCL_std, Variable, SubgroupSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupMaxSize", OpenCL_std, Variable, SubgroupMaxSize>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumSubgroups", OpenCL_std, Variable, NumSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInNumEnqueuedSubgroups", OpenCL_std, Variable, NumEnqueuedSubgroups>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupId", OpenCL_std, Variable, SubgroupId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLocalInvocationId", OpenCL_std, Variable, SubgroupLocalInvocationId>;
+defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMask", OpenCL_std, Variable,
SubgroupEqMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMaskKHR", OpenCL_std, Variable, SubgroupEqMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMask", OpenCL_std, Variable, SubgroupGeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMaskKHR", OpenCL_std, Variable, SubgroupGeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMask", OpenCL_std, Variable, SubgroupGtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMaskKHR", OpenCL_std, Variable, SubgroupGtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMask", OpenCL_std, Variable, SubgroupLeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMaskKHR", OpenCL_std, Variable, SubgroupLeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMask", OpenCL_std, Variable, SubgroupLtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMaskKHR", OpenCL_std, Variable, SubgroupLtMask>; // GetQuery builtin records: defm : DemangledGetBuiltin<"get_local_id", OpenCL_std, GetQuery, LocalInvocationId>; @@ -1375,6 +1392,14 @@ defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>; defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>; defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>; +defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationId", OpenCL_std, GetQuery, LocalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, GetQuery, GlobalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupSize", OpenCL_std, GetQuery, WorkgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalSize", OpenCL_std, GetQuery, GlobalSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupId", OpenCL_std, GetQuery, WorkgroupId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInEnqueuedWorkgroupSize", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumWorkgroups", OpenCL_std, GetQuery, NumWorkgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalOffset", OpenCL_std, GetQuery, GlobalOffset>; defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index fd0bea0b90472..6608b3f2cbefd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3120,6 +3120,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_normalize: return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize); + case Intrinsic::spv_refract: + return selectExtInst(ResVReg, ResType, I, GL::Refract); case Intrinsic::spv_reflect: return selectExtInst(ResVReg, ResType, I, GL::Reflect); case Intrinsic::spv_rsqrt: diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 2a581d381d4ab..4a9c88bfa6d39 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -68,7 +68,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup, // Extract the 
relocation type from the fixup kind, after applying STT_TLS as
   // needed.
-  unsigned Kind = Fixup.getTargetKind();
+  auto Kind = Fixup.getKind();
   if (mc::isRelocation(Fixup.getKind()))
     return Kind;
@@ -93,7 +93,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup,
   }
 
   // clang-format off
-  switch(Fixup.getTargetKind()) {
+  switch(Fixup.getKind()) {
   default:
     llvm_unreachable("Unimplemented fixup -> relocation");
   case FK_NONE:                  return ELF::R_SPARC_NONE;
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 2335853469467..cfa3511436b97 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SparcFixupKinds.h"
-#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "SparcMCTargetDesc.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index 711bf9b31a377..b191964759082 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "SparcTargetObjectFile.h"
-#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 0920c3345ecf3..9b03e85ca45bf 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -191,6 +191,35 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer,
   }
 }
 
+void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer,
+                                                           Module &M) const {
+  NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName);
+  if (!FuncInfo)
+    return;
+
+  // Emit a descriptor for every function including functions that have an
+  // available external linkage. We may not want this for imported functions
+  // that have code in another thinLTO module, but we don't have a good way to
+  // tell them apart from inline functions defined in header files. Therefore
+  // we put each descriptor in a separate comdat section and rely on the
+  // linker to deduplicate.
+  auto &C = getContext();
+  for (const auto *Operand : FuncInfo->operands()) {
+    const auto *MD = cast<MDNode>(Operand);
+    auto *GUID = mdconst::extract<ConstantInt>(MD->getOperand(0));
+    auto *Hash = mdconst::extract<ConstantInt>(MD->getOperand(1));
+    auto *Name = cast<MDString>(MD->getOperand(2));
+    auto *S = C.getObjectFileInfo()->getPseudoProbeDescSection(
+        TM->getFunctionSections() ? Name->getString() : StringRef());
+
+    Streamer.switchSection(S);
+    Streamer.emitInt64(GUID->getZExtValue());
+    Streamer.emitInt64(Hash->getZExtValue());
+    Streamer.emitULEB128IntValue(Name->getString().size());
+    Streamer.emitBytes(Name->getString());
+  }
+}
+
 /// getKindForGlobal - This is a top-level target-independent classifier for
 /// a global object.  Given a global variable and information from the TM, this
 /// function classifies the global in a target independent manner.
This function
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
index e09a916d48c90..f987621522477 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
@@ -154,7 +154,7 @@ class ELFVEAsmBackend : public VEAsmBackend {
 void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
                               const MCValue &Target, MutableArrayRef<char> Data,
                               uint64_t Value, bool IsResolved) {
-  switch (Fixup.getTargetKind()) {
+  switch (Fixup.getKind()) {
   case VE::fixup_ve_tls_gd_hi32:
   case VE::fixup_ve_tls_gd_lo32:
   case VE::fixup_ve_tpoff_hi32:
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
index 1597e7d080f03..41f31eb3b8199 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -56,7 +56,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
   }
 
   if (IsPCRel) {
-    switch (Fixup.getTargetKind()) {
+    switch (Fixup.getKind()) {
     default:
       reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind");
       return ELF::R_VE_NONE;
@@ -84,7 +84,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
     }
   }
 
-  switch (Fixup.getTargetKind()) {
+  switch (Fixup.getKind()) {
  default:
    reportError(Fixup.getLoc(), "Unknown ELF relocation type");
    return ELF::R_VE_NONE;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 656d5dd327739..28f65990120cb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -261,7 +261,6 @@
 ///
 ///===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "WebAssembly.h"
 #include "WebAssemblyTargetMachine.h"
 #include "llvm/ADT/StringExtras.h"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index ad47cb8ea2fe1..6827ee6527947 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -26,7 +26,6 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Function.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index ff2df3d5b192a..3d060c6f4a780 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -26,6 +26,7 @@
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -177,20 +178,20 @@ class X86AsmBackend : public MCAsmBackend {
   bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands,
                          const MCSubtargetInfo &STI) const override;
 
-  bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t,
+  bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &,
+                                    const MCValue &, uint64_t,
                                     bool) const override;
 
   void relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const override;
-  bool padInstructionViaRelaxation(MCRelaxableFragment &RF,
-                                   MCCodeEmitter &Emitter,
+  bool padInstructionViaRelaxation(MCFragment &RF, MCCodeEmitter &Emitter,
                                    unsigned &RemainingSize) const;
 
-  bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+  bool padInstructionViaPrefix(MCFragment &RF, MCCodeEmitter &Emitter,
                                unsigned &RemainingSize) const;
 
-  bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+  bool padInstructionEncoding(MCFragment &RF, MCCodeEmitter &Emitter,
                               unsigned &RemainingSize) const;
 
   bool finishLayout(const MCAssembler &Asm) const override;
@@ -409,10 +410,9 @@ isRightAfterData(MCFragment *CurrentFragment,
   //   it, returns true.
   // - Otherwise returns false.
   // - If the fragment is not a DataFragment, returns false.
-  if (auto *DF = dyn_cast_or_null<MCDataFragment>(F))
-    return DF->getContents().size() &&
-           (DF != PrevInstPosition.first ||
-            DF->getContents().size() != PrevInstPosition.second);
+  if (F->getKind() == MCFragment::FT_Data)
+    return F->getFixedSize() && (F != PrevInstPosition.first ||
+                                 F->getFixedSize() != PrevInstPosition.second);
 
   return false;
 }
@@ -421,11 +421,7 @@ static size_t getSizeForInstFragment(const MCFragment *F) {
   if (!F || !F->hasInstructions())
     return 0;
-  // MCEncodedFragmentWithContents being templated makes this tricky.
-  if (auto *DF = dyn_cast<MCDataFragment>(F))
-    return DF->getContents().size();
-  else
-    llvm_unreachable("Unknown fragment with instructions!");
+  return F->getSize();
 }
 
 /// Return true if we can insert NOP or prefixes automatically before the
@@ -468,10 +464,6 @@ bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const {
   if (!OS.getCurrentSectionOnly()->isText())
     return false;
 
-  // To be Done: Currently don't deal with Bundle cases.
-  if (OS.getAssembler().isBundlingEnabled())
-    return false;
-
   // Branches only need to be aligned in 32-bit or 64-bit mode.
   if (!(STI.hasFeature(X86::Is64Bit) || STI.hasFeature(X86::Is32Bit)))
     return false;
@@ -551,8 +543,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
 void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
                                        const MCInst &Inst) {
   MCFragment *CF = OS.getCurrentFragment();
-  if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF))
-    F->setAllowAutoPadding(canPadInst(Inst, OS));
+  if (CF->getKind() == MCFragment::FT_Relaxable)
+    CF->setAllowAutoPadding(canPadInst(Inst, OS));
 
   // Update PrevInstOpcode here, canPadInst() reads that.
   PrevInstOpcode = Inst.getOpcode();
@@ -575,8 +567,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
   // DataFragment, so that we can get the size of instructions later in
   // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
   // DataFragment.
-  if (isa_and_nonnull<MCRelaxableFragment>(CF))
-    OS.insert(OS.getContext().allocFragment<MCDataFragment>());
+  OS.insert(OS.getContext().allocFragment<MCDataFragment>());
 
   // Update the maximum alignment on the current section if necessary.
  MCSection *Sec = OS.getCurrentSectionOnly();
@@ -686,7 +677,7 @@ std::optional<bool> X86AsmBackend::evaluateFixup(const MCFragment &,
                                                  MCFixup &Fixup,
                                                  MCValue &Target, uint64_t &) {
   if (Fixup.isPCRel()) {
-    switch (Fixup.getTargetKind()) {
+    switch (Fixup.getKind()) {
     case FK_Data_1:
       Target.setConstant(Target.getConstant() - 1);
       break;
@@ -756,7 +747,8 @@ bool X86AsmBackend::mayNeedRelaxation(unsigned Opcode,
           Operands[Operands.size() - 1 - SkipOperands].isExpr());
 }
 
-bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
+bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &,
+                                                 const MCFixup &Fixup,
                                                  const MCValue &Target,
                                                  uint64_t Value,
                                                  bool Resolved) const {
@@ -785,7 +777,7 @@ void X86AsmBackend::relaxInstruction(MCInst &Inst,
   Inst.setOpcode(RelaxedOp);
 }
 
-bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
+bool X86AsmBackend::padInstructionViaPrefix(MCFragment &RF,
                                             MCCodeEmitter &Emitter,
                                             unsigned &RemainingSize) const {
   if (!RF.getAllowAutoPadding())
@@ -798,7 +790,7 @@ bool X86AsmBackend::padInstructionViaPrefix(MCFragment &RF,
                     *RF.getSubtargetInfo()))
     return false;
 
-  const unsigned OldSize = RF.getContents().size();
+  const unsigned OldSize = RF.getVarSize();
   if (OldSize == 15)
     return false;
@@ -827,19 +819,18 @@ bool X86AsmBackend::padInstructionViaPrefix(MCFragment &RF,
   SmallString<256> Code;
   Code.append(PrefixBytesToAdd, Prefix);
-  Code.append(RF.getContents().begin(), RF.getContents().end());
-  RF.setContents(Code);
+  Code.append(RF.getVarContents().begin(), RF.getVarContents().end());
+  RF.setVarContents(Code);
 
   // Adjust the fixups for the change in offsets
-  for (auto &F : RF.getFixups()) {
-    F.setOffset(F.getOffset() + PrefixBytesToAdd);
-  }
+  for (auto &F : RF.getVarFixups())
+    F.setOffset(PrefixBytesToAdd + F.getOffset());
 
   RemainingSize -= PrefixBytesToAdd;
   return true;
 }
 
-bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF,
+bool X86AsmBackend::padInstructionViaRelaxation(MCFragment &RF,
                                                 MCCodeEmitter &Emitter,
                                                 unsigned &RemainingSize) const {
   if (!mayNeedRelaxation(RF.getOpcode(), RF.getOperands(),
@@ -854,20 +845,20 @@ bool X86AsmBackend::padInstructionViaRelaxation(MCFragment &RF,
   SmallVector<MCFixup, 4> Fixups;
   SmallString<15> Code;
   Emitter.encodeInstruction(Relaxed, Code, Fixups, *RF.getSubtargetInfo());
-  const unsigned OldSize = RF.getContents().size();
+  const unsigned OldSize = RF.getVarContents().size();
   const unsigned NewSize = Code.size();
   assert(NewSize >= OldSize && "size decrease during relaxation?");
   unsigned Delta = NewSize - OldSize;
   if (Delta > RemainingSize)
     return false;
   RF.setInst(Relaxed);
-  RF.setContents(Code);
-  RF.setFixups(Fixups);
+  RF.setVarContents(Code);
+  RF.setVarFixups(Fixups);
   RemainingSize -= Delta;
   return true;
 }
 
-bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+bool X86AsmBackend::padInstructionEncoding(MCFragment &RF,
                                            MCCodeEmitter &Emitter,
                                            unsigned &RemainingSize) const {
   bool Changed = false;
@@ -900,7 +891,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
     if (!Sec.isText())
       continue;
 
-    SmallVector<MCRelaxableFragment *, 4> Relaxable;
+    SmallVector<MCFragment *, 4> Relaxable;
    for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
      MCFragment &F = *I;
@@ -911,7 +902,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
        continue;
 
      if (F.getKind() == MCFragment::FT_Relaxable) {
-        auto &RF = cast<MCRelaxableFragment>(*I);
+        auto &RF = cast<MCFragment>(*I);
        Relaxable.push_back(&RF);
        continue;
      }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index c34425f6661b4..0dabd98a38f44 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -258,7 +258,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
       // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
       // rewrite the movq to an leaq at link time if the symbol ends up in
       // the same linkage unit.
-      if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load)
+      if (Fixup.getKind() == X86::reloc_riprel_4byte_movq_load)
         Type = MachO::X86_64_RELOC_GOT_LOAD;
       else
         Type = MachO::X86_64_RELOC_GOT;
@@ -320,7 +320,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
       return;
     } else {
       Type = MachO::X86_64_RELOC_UNSIGNED;
-      if (Fixup.getTargetKind() == X86::reloc_signed_4byte) {
+      if (Fixup.getKind() == X86::reloc_signed_4byte) {
         reportError(
             Fixup.getLoc(),
             "32-bit absolute addressing is not supported in 64-bit mode");
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 0b4c63f7a81f7..5d5a705893242 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   return true;
 }
 
+/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized
+/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment.
+/// Technically only fp128 has a specified ABI, but it makes sense to handle
+/// i128 the same until we hear differently.
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                 CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(ValVT == MVT::i32 && "Should have i32 parts");
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  assert(PendingMembers.size() == 4 && "Should have four parts");
+
+  int64_t Offset = State.AllocateStack(16, Align(16));
+  PendingMembers[0].convertToMem(Offset);
+  PendingMembers[1].convertToMem(Offset + 4);
+  PendingMembers[2].convertToMem(Offset + 8);
+  PendingMembers[3].convertToMem(Offset + 12);
+
+  State.addLoc(PendingMembers[0]);
+  State.addLoc(PendingMembers[1]);
+  State.addLoc(PendingMembers[2]);
+  State.addLoc(PendingMembers[3]);
+  PendingMembers.clear();
+  return true;
+}
+
 // Provides entry points of CC_X86 and RetCC_X86.
 #include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 823e0caa02262..f020e0b55141c 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[
   // The 'nest' parameter, if any, is passed in ECX.
   CCIfNest<CCAssignToReg<[ECX]>>,
 
+  // i128 and fp128 need to be passed on the stack with a higher alignment than
+  // their legal types. Handle this with a custom function.
+  CCIfType<[i32],
+    CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>,
+
   // On swifttailcc pass swiftself in ECX.
CCIfCC<"CallingConv::SwiftTail", CCIfSwiftSelf>>>, diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index c7abb367fad28..0e6b4dffec3a6 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -376,8 +376,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::EH_RETURN64: { MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); - const bool Uses64BitFramePtr = - STI->isTarget64BitLP64() || STI->isTargetNaCl64(); + const bool Uses64BitFramePtr = STI->isTarget64BitLP64(); Register StackPtr = TRI->getStackRegister(); BuildMI(MBB, MBBI, DL, TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 80b51cc5ab89b..95ed5908e2314 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -54,8 +54,8 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, SlotSize = TRI->getSlotSize(); Is64Bit = STI.is64Bit(); IsLP64 = STI.isTarget64BitLP64(); - // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. - Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + // standard x86_64 uses 64-bit frame/stack pointers, x32 - 32-bit. + Uses64BitFramePtr = STI.isTarget64BitLP64(); StackPtr = TRI->getStackRegister(); } @@ -2412,7 +2412,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); - // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + // standard x86_64 uses 64-bit frame/stack pointers, x32 - 32-bit. const bool Is64BitILP32 = STI.isTarget64BitILP32(); Register FramePtr = TRI->getFrameRegister(MF); Register MachineFramePtr = diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 32c7d2bfea6c2..62073ec125e8f 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5428,10 +5428,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } case ISD::BRIND: case X86ISD::NT_BRIND: { - if (Subtarget->isTargetNaCl()) - // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We - // leave the instruction alone. - break; if (Subtarget->isTarget64BitILP32()) { // Converts a 32-bit register to a 64-bit, zero-extended version of // it. This is needed because x86-64 can do many things, but jmp %r32 diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5e35d5630d667..d91ea1ea1bb1b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36615,8 +36615,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI.getOperand(1).getReg(), - physSPReg = - IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; + physSPReg = IsLP64 ? 
  MachineFunction::iterator MBBIter = ++BB->getIterator();
@@ -37121,8 +37120,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
 
   // restoreMBB:
   if (RegInfo->hasBasePointer(*MF)) {
-    const bool Uses64BitFramePtr =
-        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+    const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
     X86FI->setRestoreBasePointer(MF);
     Register FramePtr = RegInfo->getFrameRegister(*MF);
@@ -37550,8 +37548,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
   // Add a register mask with no preserved registers.  This results in all
   // registers being marked as clobbered.
   if (RI.hasBasePointer(*MF)) {
-    const bool FPIs64Bit =
-        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+    const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
     MFI->setRestoreBasePointer(MF);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6bcb7a36e91b5..26369792db26d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,7 +1661,7 @@ namespace llvm {
 
     /// Lower interleaved load(s) into target specific
     /// instructions/intrinsics.
-    bool lowerInterleavedLoad(LoadInst *LI,
+    bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                               ArrayRef<ShuffleVectorInst *> Shuffles,
                               ArrayRef<unsigned> Indices,
                               unsigned Factor) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 9ad355311527b..b4639ac2577e8 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
 
 bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
-  // i128 split into i64 needs to be allocated to two consecutive registers,
-  // or spilled to the stack as a whole.
-  return Ty->isIntegerTy(128);
+  // On x86-64, i128 is split into two i64s and needs to be allocated to two
+  // consecutive registers, or spilled to the stack as a whole. On x86-32, i128
+  // is split into four i32s and never actually passed in registers, but we use
+  // the consecutive register mark to match it in TableGen.
+  if (Ty->isIntegerTy(128))
+    return true;
+
+  // On x86-32, fp128 acts the same as i128.
+  if (Subtarget.is32Bit() && Ty->isFP128Ty())
+    return true;
+
+  return false;
 }
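A worked instance of the layout the custom handler CC_X86_32_I128_FP128 above produces, as a standalone sketch; the bump allocator below is a stand-in for CCState::AllocateStack, not the real API, and the starting offset is an assumption for illustration:

    #include <cassert>
    #include <cstdint>

    // Stand-in for CCState::AllocateStack(Size, Align): rounds the running
    // offset up to the requested alignment, then reserves Size bytes.
    static uint64_t allocateStack(uint64_t &NextOffset, uint64_t Size,
                                  uint64_t Alignment) {
      NextOffset = (NextOffset + Alignment - 1) & ~(Alignment - 1);
      uint64_t Offset = NextOffset;
      NextOffset += Size;
      return Offset;
    }

    int main() {
      uint64_t NextOffset = 4; // assume one i32 argument was already placed
      uint64_t Base = allocateStack(NextOffset, 16, 16);
      assert(Base == 16); // rounded up from 4 to the next 16-byte boundary
      // The four i32 parts, mirroring convertToMem(Offset + 4 * i) above:
      uint64_t Parts[4] = {Base, Base + 4, Base + 8, Base + 12};
      assert(Parts[3] == 28);
    }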
 
 /// Helper for getByValTypeAlignment to determine
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 307c03c8ef541..df1541e9085bb 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -214,8 +214,6 @@ def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
 }
 def IsPS : Predicate<"Subtarget->isTargetPS()">;
 def NotPS : Predicate<"!Subtarget->isTargetPS()">;
-def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
-def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
 def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
 def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
 def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 1eb47e3b2cd18..360293bce54e8 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
 // number of shuffles and ISA.
 // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
 bool X86TargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
@@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad(
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
+  auto *LI = dyn_cast<LoadInst>(Load);
+  if (!LI)
+    return false;
+  assert(!Mask && "Unexpected mask on a load");
+
   // Create an interleaved access group.
   IRBuilder<> Builder(LI);
   X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index a8ee9f55611b6..8ad8d423d10cc 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -302,13 +302,12 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
     reportFatalUsageError("64-bit code requested on a subtarget that doesn't "
                           "support it!");
 
-  // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD, NaCl, and for all
+  // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD, and for all
   // 64-bit targets.  On Solaris (32-bit), stack alignment is 4 bytes
   // following the i386 psABI, while on Illumos it is always 16 bytes.
   if (StackAlignOverride)
     stackAlignment = *StackAlignOverride;
-  else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() ||
-           isTargetNaCl() || Is64Bit)
+  else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || Is64Bit)
     stackAlignment = Align(16);
 
   // Consume the vector width attribute or apply any target specific limit.
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 38b8c246eb291..be49214e041ea 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -170,14 +170,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
 #include "X86GenSubtargetInfo.inc"
 
   /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
- bool isTarget64BitILP32() const { - return Is64Bit && (TargetTriple.isX32() || TargetTriple.isOSNaCl()); - } + bool isTarget64BitILP32() const { return Is64Bit && (TargetTriple.isX32()); } /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? - bool isTarget64BitLP64() const { - return Is64Bit && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl()); - } + bool isTarget64BitLP64() const { return Is64Bit && (!TargetTriple.isX32()); } PICStyles::Style getPICStyle() const { return PICStyle; } void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } @@ -299,9 +295,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); } bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); } bool isTargetAndroid() const { return TargetTriple.isAndroid(); } - bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } - bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } - bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 85cc5b43d40bf..6d9c6cdedd9e5 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -131,7 +131,7 @@ static std::string computeDataLayout(const Triple &TT) { Ret += DataLayout::getManglingComponent(TT); // X86 and x32 have 32 bit pointers. - if (!TT.isArch64Bit() || TT.isX32() || TT.isOSNaCl()) + if (!TT.isArch64Bit() || TT.isX32()) Ret += "-p:32:32"; // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers. @@ -140,7 +140,7 @@ static std::string computeDataLayout(const Triple &TT) { // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. // 128 bit integers are not specified in the 32-bit ABIs but are used // internally for lowering f128, so we match the alignment to that. - if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) + if (TT.isArch64Bit() || TT.isOSWindows()) Ret += "-i64:64-i128:128"; else if (TT.isOSIAMCU()) Ret += "-i64:32-f64:32"; @@ -148,7 +148,7 @@ static std::string computeDataLayout(const Triple &TT) { Ret += "-i128:128-f64:32:64"; // Some ABIs align long double to 128 bits, others to 32. 
-  if (TT.isOSNaCl() || TT.isOSIAMCU())
+  if (TT.isOSIAMCU())
     ; // No f80
   else if (TT.isArch64Bit() || TT.isOSDarwin() || TT.isWindowsMSVCEnvironment())
     Ret += "-f80:128";
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp
index 27111fce45662..a650f6f069e5f 100644
--- a/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -811,7 +811,7 @@ void WinEHStatePass::updateEspForInAllocas(Function &F) {
       if (auto *Alloca = dyn_cast<AllocaInst>(&I)) {
         if (Alloca->isStaticAlloca())
           continue;
-        IRBuilder<> Builder(Alloca->getNextNonDebugInstruction());
+        IRBuilder<> Builder(Alloca->getNextNode());
         // SavedESP = llvm.stacksave()
         Value *SP = Builder.CreateStackSave();
         Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
@@ -820,7 +820,7 @@ void WinEHStatePass::updateEspForInAllocas(Function &F) {
       if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
         if (II->getIntrinsicID() != Intrinsic::stackrestore)
           continue;
-        IRBuilder<> Builder(II->getNextNonDebugInstruction());
+        IRBuilder<> Builder(II->getNextNode());
         // SavedESP = llvm.stacksave()
         Value *SP = Builder.CreateStackSave();
         Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
index 671f1d04daf23..9167794a51e8b 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp
@@ -144,7 +144,7 @@ std::optional<bool> XtensaAsmBackend::evaluateFixup(const MCFragment &F,
   // For a few PC-relative fixups, offsets need to be aligned down. We
   // compensate here because the default handler's `Value` decrement doesn't
   // account for this alignment.
-  switch (Fixup.getTargetKind()) {
+  switch (Fixup.getKind()) {
   case Xtensa::fixup_xtensa_call_18:
   case Xtensa::fixup_xtensa_l32r_16:
     Value = (Asm->getFragmentOffset(F) + Fixup.getOffset()) % 4;
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index 9ff6521c5d1e8..dcb30b7ba0410 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -648,7 +648,6 @@ StringRef ARM::getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch) {
     default:
       return "strongarm";
     }
-  case llvm::Triple::NaCl:
   case llvm::Triple::OpenBSD:
     return "cortex-a8";
   default:
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 8fd91fcd33f63..78bd5b4b5bd25 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1855,7 +1855,7 @@ VendorSignatures getVendorSignature(unsigned *MaxLeaf) {
 
 #if defined(__i386__) || defined(_M_IX86) || \
     defined(__x86_64__) || defined(_M_X64)
-const StringMap<bool> sys::getHostCPUFeatures() {
+StringMap<bool> sys::getHostCPUFeatures() {
   unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
   unsigned MaxLevel;
   StringMap<bool> Features;
@@ -2068,7 +2068,7 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   return Features;
 }
 #elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
-const StringMap<bool> sys::getHostCPUFeatures() {
+StringMap<bool> sys::getHostCPUFeatures() {
   StringMap<bool> Features;
   std::unique_ptr<MemoryBuffer> P = getProcCpuinfoContent();
   if (!P)
@@ -2148,7 +2148,7 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   return Features;
 }
 #elif defined(_WIN32) && (defined(__aarch64__) || defined(_M_ARM64))
-const StringMap<bool> sys::getHostCPUFeatures() {
+StringMap<bool> sys::getHostCPUFeatures() {
   StringMap<bool> Features;
 
  // If we're asking the OS at runtime, believe what the OS says
believe what the OS says @@ -2167,7 +2167,7 @@ const StringMap sys::getHostCPUFeatures() { } #elif defined(__linux__) && defined(__loongarch__) #include -const StringMap sys::getHostCPUFeatures() { +StringMap sys::getHostCPUFeatures() { unsigned long hwcap = getauxval(AT_HWCAP); bool HasFPU = hwcap & (1UL << 3); // HWCAP_LOONGARCH_FPU uint32_t cpucfg2 = 0x2, cpucfg3 = 0x3; @@ -2196,7 +2196,7 @@ const StringMap sys::getHostCPUFeatures() { return Features; } #elif defined(__linux__) && defined(__riscv) -const StringMap sys::getHostCPUFeatures() { +StringMap sys::getHostCPUFeatures() { RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_BASE_BEHAVIOR=*/3, 0}, {/*RISCV_HWPROBE_KEY_IMA_EXT_0=*/4, 0}, {/*RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF=*/9, 0}}; @@ -2279,7 +2279,7 @@ const StringMap sys::getHostCPUFeatures() { return Features; } #else -const StringMap sys::getHostCPUFeatures() { return {}; } +StringMap sys::getHostCPUFeatures() { return {}; } #endif #if __APPLE__ diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index d7e206ef8cd4f..4ca7444a73b35 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx1250-insts"] = true; Features["bitop3-insts"] = true; Features["prng-inst"] = true; + Features["tanh-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; Features["fp8-conversion-insts"] = true; diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 0584c941d2e6e..be51453ee21d7 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -64,6 +64,10 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case renderscript64: return "renderscript64"; case riscv32: return "riscv32"; case riscv64: return "riscv64"; + case riscv32be: + return "riscv32be"; + case riscv64be: + return "riscv64be"; case shave: return "shave"; case sparc: return "sparc"; case sparcel: return "sparcel"; @@ -238,7 +242,10 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case wasm64: return "wasm"; case riscv32: - case riscv64: return "riscv"; + case riscv64: + case riscv32be: + case riscv64be: + return "riscv"; case ve: return "ve"; case csky: return "csky"; @@ -304,7 +311,6 @@ StringRef Triple::getOSTypeName(OSType Kind) { return "managarm"; case Mesa3D: return "mesa3d"; case NVCL: return "nvcl"; - case NaCl: return "nacl"; case NetBSD: return "netbsd"; case OpenBSD: return "openbsd"; case PS4: return "ps4"; @@ -427,71 +433,73 @@ static Triple::ArchType parseBPFArch(StringRef ArchName) { Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { Triple::ArchType BPFArch(parseBPFArch(Name)); return StringSwitch(Name) - .Case("aarch64", aarch64) - .Case("aarch64_be", aarch64_be) - .Case("aarch64_32", aarch64_32) - .Case("arc", arc) - .Case("arm64", aarch64) // "arm64" is an alias for "aarch64" - .Case("arm64_32", aarch64_32) - .Case("arm", arm) - .Case("armeb", armeb) - .Case("avr", avr) - .StartsWith("bpf", BPFArch) - .Case("m68k", m68k) - .Case("mips", mips) - .Case("mipsel", mipsel) - .Case("mips64", mips64) - .Case("mips64el", mips64el) - .Case("msp430", msp430) - .Case("ppc64", ppc64) - .Case("ppc32", ppc) - .Case("ppc", ppc) - .Case("ppc32le", ppcle) - .Case("ppcle", ppcle) - .Case("ppc64le", ppc64le) - .Case("r600", r600) - .Case("amdgcn", amdgcn) - .Case("riscv32", riscv32) - .Case("riscv64", riscv64) - .Case("hexagon", 
hexagon) - .Case("sparc", sparc) - .Case("sparcel", sparcel) - .Case("sparcv9", sparcv9) - .Case("s390x", systemz) - .Case("systemz", systemz) - .Case("tce", tce) - .Case("tcele", tcele) - .Case("thumb", thumb) - .Case("thumbeb", thumbeb) - .Case("x86", x86) - .Case("i386", x86) - .Case("x86-64", x86_64) - .Case("xcore", xcore) - .Case("nvptx", nvptx) - .Case("nvptx64", nvptx64) - .Case("amdil", amdil) - .Case("amdil64", amdil64) - .Case("hsail", hsail) - .Case("hsail64", hsail64) - .Case("spir", spir) - .Case("spir64", spir64) - .Case("spirv", spirv) - .Case("spirv32", spirv32) - .Case("spirv64", spirv64) - .Case("kalimba", kalimba) - .Case("lanai", lanai) - .Case("shave", shave) - .Case("wasm32", wasm32) - .Case("wasm64", wasm64) - .Case("renderscript32", renderscript32) - .Case("renderscript64", renderscript64) - .Case("ve", ve) - .Case("csky", csky) - .Case("loongarch32", loongarch32) - .Case("loongarch64", loongarch64) - .Case("dxil", dxil) - .Case("xtensa", xtensa) - .Default(UnknownArch); + .Case("aarch64", aarch64) + .Case("aarch64_be", aarch64_be) + .Case("aarch64_32", aarch64_32) + .Case("arc", arc) + .Case("arm64", aarch64) // "arm64" is an alias for "aarch64" + .Case("arm64_32", aarch64_32) + .Case("arm", arm) + .Case("armeb", armeb) + .Case("avr", avr) + .StartsWith("bpf", BPFArch) + .Case("m68k", m68k) + .Case("mips", mips) + .Case("mipsel", mipsel) + .Case("mips64", mips64) + .Case("mips64el", mips64el) + .Case("msp430", msp430) + .Case("ppc64", ppc64) + .Case("ppc32", ppc) + .Case("ppc", ppc) + .Case("ppc32le", ppcle) + .Case("ppcle", ppcle) + .Case("ppc64le", ppc64le) + .Case("r600", r600) + .Case("amdgcn", amdgcn) + .Case("riscv32", riscv32) + .Case("riscv64", riscv64) + .Case("riscv32be", riscv32be) + .Case("riscv64be", riscv64be) + .Case("hexagon", hexagon) + .Case("sparc", sparc) + .Case("sparcel", sparcel) + .Case("sparcv9", sparcv9) + .Case("s390x", systemz) + .Case("systemz", systemz) + .Case("tce", tce) + .Case("tcele", tcele) + .Case("thumb", thumb) + .Case("thumbeb", thumbeb) + .Case("x86", x86) + .Case("i386", x86) + .Case("x86-64", x86_64) + .Case("xcore", xcore) + .Case("nvptx", nvptx) + .Case("nvptx64", nvptx64) + .Case("amdil", amdil) + .Case("amdil64", amdil64) + .Case("hsail", hsail) + .Case("hsail64", hsail64) + .Case("spir", spir) + .Case("spir64", spir64) + .Case("spirv", spirv) + .Case("spirv32", spirv32) + .Case("spirv64", spirv64) + .Case("kalimba", kalimba) + .Case("lanai", lanai) + .Case("shave", shave) + .Case("wasm32", wasm32) + .Case("wasm64", wasm64) + .Case("renderscript32", renderscript32) + .Case("renderscript64", renderscript64) + .Case("ve", ve) + .Case("csky", csky) + .Case("loongarch32", loongarch32) + .Case("loongarch64", loongarch64) + .Case("dxil", dxil) + .Case("xtensa", xtensa) + .Default(UnknownArch); } static Triple::ArchType parseARMArch(StringRef ArchName) { @@ -560,84 +568,85 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { } static Triple::ArchType parseArch(StringRef ArchName) { - auto AT = - StringSwitch(ArchName) - .Cases("i386", "i486", "i586", "i686", Triple::x86) - // FIXME: Do we need to support these? 
- .Cases("i786", "i886", "i986", Triple::x86) - .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) - .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) - .Cases("powerpcle", "ppcle", "ppc32le", Triple::ppcle) - .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) - .Cases("powerpc64le", "ppc64le", Triple::ppc64le) - .Case("xscale", Triple::arm) - .Case("xscaleeb", Triple::armeb) - .Case("aarch64", Triple::aarch64) - .Case("aarch64_be", Triple::aarch64_be) - .Case("aarch64_32", Triple::aarch64_32) - .Case("arc", Triple::arc) - .Case("arm64", Triple::aarch64) - .Case("arm64_32", Triple::aarch64_32) - .Case("arm64e", Triple::aarch64) - .Case("arm64ec", Triple::aarch64) - .Case("arm", Triple::arm) - .Case("armeb", Triple::armeb) - .Case("thumb", Triple::thumb) - .Case("thumbeb", Triple::thumbeb) - .Case("avr", Triple::avr) - .Case("m68k", Triple::m68k) - .Case("msp430", Triple::msp430) - .Cases("mips", "mipseb", "mipsallegrex", "mipsisa32r6", "mipsr6", - Triple::mips) - .Cases("mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el", - Triple::mipsel) - .Cases("mips64", "mips64eb", "mipsn32", "mipsisa64r6", "mips64r6", - "mipsn32r6", Triple::mips64) - .Cases("mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el", - "mipsn32r6el", Triple::mips64el) - .Case("r600", Triple::r600) - .Case("amdgcn", Triple::amdgcn) - .Case("riscv32", Triple::riscv32) - .Case("riscv64", Triple::riscv64) - .Case("hexagon", Triple::hexagon) - .Cases("s390x", "systemz", Triple::systemz) - .Case("sparc", Triple::sparc) - .Case("sparcel", Triple::sparcel) - .Cases("sparcv9", "sparc64", Triple::sparcv9) - .Case("tce", Triple::tce) - .Case("tcele", Triple::tcele) - .Case("xcore", Triple::xcore) - .Case("nvptx", Triple::nvptx) - .Case("nvptx64", Triple::nvptx64) - .Case("amdil", Triple::amdil) - .Case("amdil64", Triple::amdil64) - .Case("hsail", Triple::hsail) - .Case("hsail64", Triple::hsail64) - .Case("spir", Triple::spir) - .Case("spir64", Triple::spir64) - .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv) - .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", - "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", - "spirv32v1.6", Triple::spirv32) - .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", - "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", - "spirv64v1.6", Triple::spirv64) - .StartsWith("kalimba", Triple::kalimba) - .Case("lanai", Triple::lanai) - .Case("renderscript32", Triple::renderscript32) - .Case("renderscript64", Triple::renderscript64) - .Case("shave", Triple::shave) - .Case("ve", Triple::ve) - .Case("wasm32", Triple::wasm32) - .Case("wasm64", Triple::wasm64) - .Case("csky", Triple::csky) - .Case("loongarch32", Triple::loongarch32) - .Case("loongarch64", Triple::loongarch64) - .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", - "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", "dxilv1.8", - Triple::dxil) - .Case("xtensa", Triple::xtensa) - .Default(Triple::UnknownArch); + auto AT = StringSwitch(ArchName) + .Cases("i386", "i486", "i586", "i686", Triple::x86) + // FIXME: Do we need to support these? 
+ .Cases("i786", "i886", "i986", Triple::x86) + .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) + .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) + .Cases("powerpcle", "ppcle", "ppc32le", Triple::ppcle) + .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) + .Cases("powerpc64le", "ppc64le", Triple::ppc64le) + .Case("xscale", Triple::arm) + .Case("xscaleeb", Triple::armeb) + .Case("aarch64", Triple::aarch64) + .Case("aarch64_be", Triple::aarch64_be) + .Case("aarch64_32", Triple::aarch64_32) + .Case("arc", Triple::arc) + .Case("arm64", Triple::aarch64) + .Case("arm64_32", Triple::aarch64_32) + .Case("arm64e", Triple::aarch64) + .Case("arm64ec", Triple::aarch64) + .Case("arm", Triple::arm) + .Case("armeb", Triple::armeb) + .Case("thumb", Triple::thumb) + .Case("thumbeb", Triple::thumbeb) + .Case("avr", Triple::avr) + .Case("m68k", Triple::m68k) + .Case("msp430", Triple::msp430) + .Cases("mips", "mipseb", "mipsallegrex", "mipsisa32r6", + "mipsr6", Triple::mips) + .Cases("mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el", + Triple::mipsel) + .Cases("mips64", "mips64eb", "mipsn32", "mipsisa64r6", + "mips64r6", "mipsn32r6", Triple::mips64) + .Cases("mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el", + "mipsn32r6el", Triple::mips64el) + .Case("r600", Triple::r600) + .Case("amdgcn", Triple::amdgcn) + .Case("riscv32", Triple::riscv32) + .Case("riscv64", Triple::riscv64) + .Case("riscv32be", Triple::riscv32be) + .Case("riscv64be", Triple::riscv64be) + .Case("hexagon", Triple::hexagon) + .Cases("s390x", "systemz", Triple::systemz) + .Case("sparc", Triple::sparc) + .Case("sparcel", Triple::sparcel) + .Cases("sparcv9", "sparc64", Triple::sparcv9) + .Case("tce", Triple::tce) + .Case("tcele", Triple::tcele) + .Case("xcore", Triple::xcore) + .Case("nvptx", Triple::nvptx) + .Case("nvptx64", Triple::nvptx64) + .Case("amdil", Triple::amdil) + .Case("amdil64", Triple::amdil64) + .Case("hsail", Triple::hsail) + .Case("hsail64", Triple::hsail64) + .Case("spir", Triple::spir) + .Case("spir64", Triple::spir64) + .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv) + .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", + "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", + "spirv32v1.6", Triple::spirv32) + .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", + "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", + "spirv64v1.6", Triple::spirv64) + .StartsWith("kalimba", Triple::kalimba) + .Case("lanai", Triple::lanai) + .Case("renderscript32", Triple::renderscript32) + .Case("renderscript64", Triple::renderscript64) + .Case("shave", Triple::shave) + .Case("ve", Triple::ve) + .Case("wasm32", Triple::wasm32) + .Case("wasm64", Triple::wasm64) + .Case("csky", Triple::csky) + .Case("loongarch32", Triple::loongarch32) + .Case("loongarch64", Triple::loongarch64) + .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", + "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", + "dxilv1.8", Triple::dxil) + .Case("xtensa", Triple::xtensa) + .Default(Triple::UnknownArch); // Some architectures require special parsing logic just to compute the // ArchType result. 
@@ -693,7 +702,6 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("zos", Triple::ZOS) .StartsWith("haiku", Triple::Haiku) .StartsWith("rtems", Triple::RTEMS) - .StartsWith("nacl", Triple::NaCl) .StartsWith("aix", Triple::AIX) .StartsWith("cuda", Triple::CUDA) .StartsWith("nvcl", Triple::NVCL) @@ -968,6 +976,8 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::renderscript64: case Triple::riscv32: case Triple::riscv64: + case Triple::riscv32be: + case Triple::riscv64be: case Triple::shave: case Triple::sparc: case Triple::sparcel: @@ -1690,6 +1700,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::r600: case llvm::Triple::renderscript32: case llvm::Triple::riscv32: + case llvm::Triple::riscv32be: case llvm::Triple::shave: case llvm::Triple::sparc: case llvm::Triple::sparcel: @@ -1720,6 +1731,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::ppc64le: case llvm::Triple::renderscript64: case llvm::Triple::riscv64: + case llvm::Triple::riscv64be: case llvm::Triple::sparcv9: case llvm::Triple::spirv: case llvm::Triple::spir64: @@ -1798,6 +1810,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::r600: case Triple::renderscript32: case Triple::riscv32: + case Triple::riscv32be: case Triple::shave: case Triple::sparc: case Triple::sparcel: @@ -1830,6 +1843,9 @@ Triple Triple::get32BitArchVariant() const { case Triple::ppc64le: T.setArch(Triple::ppcle); break; case Triple::renderscript64: T.setArch(Triple::renderscript32); break; case Triple::riscv64: T.setArch(Triple::riscv32); break; + case Triple::riscv64be: + T.setArch(Triple::riscv32be); + break; case Triple::sparcv9: T.setArch(Triple::sparc); break; case Triple::spir64: T.setArch(Triple::spir); break; case Triple::spirv: @@ -1880,6 +1896,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::ppc64le: case Triple::renderscript64: case Triple::riscv64: + case Triple::riscv64be: case Triple::sparcv9: case Triple::spir64: case Triple::spirv64: @@ -1907,6 +1924,9 @@ Triple Triple::get64BitArchVariant() const { case Triple::ppcle: T.setArch(Triple::ppc64le); break; case Triple::renderscript32: T.setArch(Triple::renderscript64); break; case Triple::riscv32: T.setArch(Triple::riscv64); break; + case Triple::riscv32be: + T.setArch(Triple::riscv64be); + break; case Triple::sparc: T.setArch(Triple::sparcv9); break; case Triple::spir: T.setArch(Triple::spir64); break; case Triple::spirv: @@ -1945,8 +1965,6 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::r600: case Triple::renderscript32: case Triple::renderscript64: - case Triple::riscv32: - case Triple::riscv64: case Triple::shave: case Triple::spir64: case Triple::spir: @@ -1979,6 +1997,12 @@ Triple Triple::getBigEndianArchVariant() const { break; case Triple::ppcle: T.setArch(Triple::ppc); break; case Triple::ppc64le: T.setArch(Triple::ppc64); break; + case Triple::riscv32: + T.setArch(Triple::riscv32be); + break; + case Triple::riscv64: + T.setArch(Triple::riscv64be); + break; case Triple::sparcel: T.setArch(Triple::sparc); break; case Triple::tcele: T.setArch(Triple::tce); break; default: @@ -2016,6 +2040,12 @@ Triple Triple::getLittleEndianArchVariant() const { break; case Triple::ppc: T.setArch(Triple::ppcle); break; case Triple::ppc64: T.setArch(Triple::ppc64le); break; + case Triple::riscv32be: + T.setArch(Triple::riscv32); + break; + case Triple::riscv64be: + T.setArch(Triple::riscv64); + break; case Triple::sparc: 
T.setArch(Triple::sparcel); break; case Triple::tce: T.setArch(Triple::tcele); break; default: diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index fe30c6dc6abe4..fbeb7218ba9a3 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1179,6 +1179,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { AllocaInst *Alloca = P.Alloca; auto *G = GetFramePointer(Alloca); + // Remove any lifetime intrinsics, now that these are no longer allocas. + for (User *U : make_early_inc_range(Alloca->users())) { + auto *I = cast(U); + if (I->isLifetimeStartOrEnd()) + I->eraseFromParent(); + } + // We are not using ReplaceInstWithInst(P.first, cast(G)) // here, as we are changing location of the instruction. G->takeName(Alloca); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index f1e17ff01a4a2..5a8a41f0dc432 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -808,15 +808,14 @@ static void updateScopeLine(Instruction *ActiveSuspend, return; // No subsequent instruction -> fallback to the location of ActiveSuspend. - if (!ActiveSuspend->getNextNonDebugInstruction()) { + if (!ActiveSuspend->getNextNode()) { if (auto DL = ActiveSuspend->getDebugLoc()) if (SPToUpdate.getFile() == DL->getFile()) SPToUpdate.setScopeLine(DL->getLine()); return; } - BasicBlock::iterator Successor = - ActiveSuspend->getNextNonDebugInstruction()->getIterator(); + BasicBlock::iterator Successor = ActiveSuspend->getNextNode()->getIterator(); // Corosplit splits the BB around ActiveSuspend, so the meaningful // instructions are not in the same BB. 
if (auto *Branch = dyn_cast_or_null(Successor); diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 448e1b4982b74..077d29f7499a4 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -793,7 +793,7 @@ isPotentiallyReachable(Attributor &A, const Instruction &FromI, if (isa(CB)) return false; - Instruction *Inst = CB->getNextNonDebugInstruction(); + Instruction *Inst = CB->getNextNode(); Worklist.push_back(Inst); return true; }; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index ed2ac4dbfeecd..3c24d2eca647d 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -1776,7 +1776,7 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) { do { if (FromI->mayWriteToMemory() && !IsAssumption(*FromI)) return true; - FromI = FromI->getNextNonDebugInstruction(); + FromI = FromI->getNextNode(); } while (FromI && FromI != ToI); return false; }; @@ -1787,7 +1787,7 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) { return false; BasicBlock *IntrBB = IntrI.getParent(); if (IntrI.getParent() == BB) { - if (IsImpactedInRange(LoadI->getNextNonDebugInstruction(), &IntrI)) + if (IsImpactedInRange(LoadI->getNextNode(), &IntrI)) return false; } else { auto PredIt = pred_begin(IntrBB); @@ -1804,8 +1804,7 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) { continue; return false; } - if (IsImpactedInRange(LoadI->getNextNonDebugInstruction(), - BB->getTerminator())) + if (IsImpactedInRange(LoadI->getNextNode(), BB->getTerminator())) return false; if (IsImpactedInRange(&IntrBB->front(), &IntrI)) return false; diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index b1c0e9910f378..2623be3e88fc0 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1878,7 +1878,7 @@ static void RemovePreallocated(Function *F) { Builder.SetInsertPoint(PreallocatedSetup); auto *StackSave = Builder.CreateStackSave(); - Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction()); + Builder.SetInsertPoint(NewCB->getNextNode()); Builder.CreateStackRestore(StackSave); // Replace @llvm.call.preallocated.arg() with alloca. @@ -1902,7 +1902,7 @@ static void RemovePreallocated(Function *F) { auto AddressSpace = UseCall->getType()->getPointerAddressSpace(); auto *ArgType = UseCall->getFnAttr(Attribute::Preallocated).getValueAsType(); - auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction(); + auto *InsertBefore = PreallocatedSetup->getNextNode(); Builder.SetInsertPoint(InsertBefore); auto *Alloca = Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg"); diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index 8d6ff72fa6061..c57981ae4ca0d 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -267,8 +267,7 @@ void OutlinableRegion::splitCandidate() { // region is the same as the recorded instruction following the last // instruction. If they do not match, there could be problems in rewriting // the program after outlining, so we ignore it. 
- if (!BackInst->isTerminator() && - EndInst != BackInst->getNextNonDebugInstruction()) + if (!BackInst->isTerminator() && EndInst != BackInst->getNextNode()) return; Instruction *StartInst = (*Candidate->begin()).Inst; @@ -2326,7 +2325,7 @@ static bool nextIRInstructionDataMatchesNextInst(IRInstructionData &ID) { Instruction *NextIDLInst = NextIDIt->Inst; Instruction *NextModuleInst = nullptr; if (!ID.Inst->isTerminator()) - NextModuleInst = ID.Inst->getNextNonDebugInstruction(); + NextModuleInst = ID.Inst->getNextNode(); else if (NextIDLInst != nullptr) NextModuleInst = &*NextIDIt->Inst->getParent()->instructionsWithoutDebug().begin(); @@ -2353,7 +2352,7 @@ bool IROutliner::isCompatibleWithAlreadyOutlinedCode( // if it does not, we fix it in the InstructionDataList. if (!Region.Candidate->backInstruction()->isTerminator()) { Instruction *NewEndInst = - Region.Candidate->backInstruction()->getNextNonDebugInstruction(); + Region.Candidate->backInstruction()->getNextNode(); assert(NewEndInst && "Next instruction is a nullptr?"); if (Region.Candidate->end()->Inst != NewEndInst) { IRInstructionDataList *IDL = Region.Candidate->front()->IDL; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index dd7ae7e66e350..5e2247f2a88d0 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2856,7 +2856,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { if (!It->getSecond().IsReachingAlignedBarrierOnly) ForwardIsOk = false; break; - } while ((CurI = CurI->getNextNonDebugInstruction())); + } while ((CurI = CurI->getNextNode())); if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly) ForwardIsOk = false; @@ -2875,7 +2875,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { if (It->getSecond().IsReachedFromAlignedBarrierOnly) break; return false; - } while ((CurI = CurI->getPrevNonDebugInstruction())); + } while ((CurI = CurI->getPrevNode())); // Delayed decision on the forward pass to allow aligned barrier detection // in the backwards traversal. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index e521c9d7001ac..3321435a6fecb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3298,7 +3298,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Remove an assume if it is followed by an identical assume. // TODO: Do we need this? Unless there are conflicting assumptions, the // computeKnownBits(IIOperand) below here eliminates redundant assumes. - Instruction *Next = II->getNextNonDebugInstruction(); + Instruction *Next = II->getNextNode(); if (match(Next, m_Intrinsic(m_Specific(IIOperand)))) return RemoveConditionFromAssume(Next); @@ -3471,12 +3471,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Is this guard followed by another guard? We scan forward over a small // fixed window of instructions to handle common cases with conditions // computed between guards. 
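    // For example (illustrative IR), the canonicalization below turns
    //   call void (i1, ...) @llvm.experimental.guard(i1 %a) [ "deopt"() ]
    //   call void (i1, ...) @llvm.experimental.guard(i1 %b) [ "deopt"() ]
    // into
    //   %widened = and i1 %a, %b
    //   call void (i1, ...) @llvm.experimental.guard(i1 %widened) [ "deopt"() ]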
-    Instruction *NextInst = II->getNextNonDebugInstruction();
+    Instruction *NextInst = II->getNextNode();
     for (unsigned i = 0; i < GuardWideningWindow; i++) {
       // Note: Using context-free form to avoid compile time blow up
       if (!isSafeToSpeculativelyExecute(NextInst))
         break;
-      NextInst = NextInst->getNextNonDebugInstruction();
+      NextInst = NextInst->getNextNode();
     }
     Value *NextCond = nullptr;
     if (match(NextInst,
@@ -3486,10 +3486,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     // Remove a guard that is immediately preceded by an identical guard.
     // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
     if (CurrCond != NextCond) {
-      Instruction *MoveI = II->getNextNonDebugInstruction();
+      Instruction *MoveI = II->getNextNode();
       while (MoveI != NextInst) {
         auto *Temp = MoveI;
-        MoveI = MoveI->getNextNonDebugInstruction();
+        MoveI = MoveI->getNextNode();
         Temp->moveBefore(II->getIterator());
       }
       replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond));
@@ -3913,7 +3913,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
 // Fence instruction simplification
 Instruction *InstCombinerImpl::visitFenceInst(FenceInst &FI) {
-  auto *NFI = dyn_cast<FenceInst>(FI.getNextNonDebugInstruction());
+  auto *NFI = dyn_cast<FenceInst>(FI.getNextNode());
   // This check is solely here to handle arbitrary target-dependent syncscopes.
   // TODO: Can remove if it does not matter in practice.
   if (NFI && FI.isIdenticalTo(NFI))
@@ -3933,7 +3933,7 @@ Instruction *InstCombinerImpl::visitFenceInst(FenceInst &FI) {
   if (NFI && isIdenticalOrStrongerFence(NFI, &FI))
     return eraseInstFromFunction(FI);
-  if (auto *PFI = dyn_cast_or_null<FenceInst>(FI.getPrevNonDebugInstruction()))
+  if (auto *PFI = dyn_cast_or_null<FenceInst>(FI.getPrevNode()))
     if (isIdenticalOrStrongerFence(PFI, &FI))
       return eraseInstFromFunction(FI);
   return nullptr;
@@ -4050,6 +4050,111 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) {
   return nullptr;
 }
 
+Instruction *InstCombinerImpl::foldPtrAuthIntrinsicCallee(CallBase &Call) {
+  const Value *Callee = Call.getCalledOperand();
+  const auto *IPC = dyn_cast<IntToPtrInst>(Callee);
+  if (!IPC || !IPC->isNoopCast(DL))
+    return nullptr;
+
+  const auto *II = dyn_cast<IntrinsicInst>(IPC->getOperand(0));
+  if (!II)
+    return nullptr;
+
+  Intrinsic::ID IIID = II->getIntrinsicID();
+  if (IIID != Intrinsic::ptrauth_resign && IIID != Intrinsic::ptrauth_sign)
+    return nullptr;
+
+  // Isolate the ptrauth bundle from the others.
+  std::optional<OperandBundleUse> PtrAuthBundleOrNone;
+  SmallVector<OperandBundleDef> NewBundles;
+  for (unsigned BI = 0, BE = Call.getNumOperandBundles(); BI != BE; ++BI) {
+    OperandBundleUse Bundle = Call.getOperandBundleAt(BI);
+    if (Bundle.getTagID() == LLVMContext::OB_ptrauth)
+      PtrAuthBundleOrNone = Bundle;
+    else
+      NewBundles.emplace_back(Bundle);
+  }
+
+  if (!PtrAuthBundleOrNone)
+    return nullptr;
+
+  Value *NewCallee = nullptr;
+  switch (IIID) {
+  // call(ptrauth.resign(p)), ["ptrauth"()] -> call p, ["ptrauth"()]
+  // assuming the call bundle and the sign operands match.
+  case Intrinsic::ptrauth_resign: {
+    // Resign result key should match bundle.
+    if (II->getOperand(3) != PtrAuthBundleOrNone->Inputs[0])
+      return nullptr;
+    // Resign result discriminator should match bundle.
+    if (II->getOperand(4) != PtrAuthBundleOrNone->Inputs[1])
+      return nullptr;
+
+    // Resign input (auth) key should also match: we can't change the key on
+    // the new call we're generating, because we don't know what keys are valid.
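+    // Illustrative sketch (value names assumed): given
+    //   %r = call i64 @llvm.ptrauth.resign(i64 %p, i32 0, i64 %d1,
+    //                                      i32 0, i64 %d2)
+    //   %f = inttoptr i64 %r to ptr
+    //   call void %f() [ "ptrauth"(i32 0, i64 %d2) ]
+    // the resign folds away and the rewritten call authenticates %p directly
+    // with bundle [ "ptrauth"(i32 0, i64 %d1) ].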
+    if (II->getOperand(1) != PtrAuthBundleOrNone->Inputs[0])
+      return nullptr;
+
+    Value *NewBundleOps[] = {II->getOperand(1), II->getOperand(2)};
+    NewBundles.emplace_back("ptrauth", NewBundleOps);
+    NewCallee = II->getOperand(0);
+    break;
+  }
+
+  // call(ptrauth.sign(p)), ["ptrauth"()] -> call p
+  // assuming the call bundle and the sign operands match.
+  // Non-ptrauth indirect calls are undesirable, but so is ptrauth.sign.
+  case Intrinsic::ptrauth_sign: {
+    // Sign key should match bundle.
+    if (II->getOperand(1) != PtrAuthBundleOrNone->Inputs[0])
+      return nullptr;
+    // Sign discriminator should match bundle.
+    if (II->getOperand(2) != PtrAuthBundleOrNone->Inputs[1])
+      return nullptr;
+    NewCallee = II->getOperand(0);
+    break;
+  }
+  default:
+    llvm_unreachable("unexpected intrinsic ID");
+  }
+
+  if (!NewCallee)
+    return nullptr;
+
+  NewCallee = Builder.CreateBitOrPointerCast(NewCallee, Callee->getType());
+  CallBase *NewCall = CallBase::Create(&Call, NewBundles);
+  NewCall->setCalledOperand(NewCallee);
+  return NewCall;
+}
+
+Instruction *InstCombinerImpl::foldPtrAuthConstantCallee(CallBase &Call) {
+  auto *CPA = dyn_cast<ConstantPtrAuth>(Call.getCalledOperand());
+  if (!CPA)
+    return nullptr;
+
+  auto *CalleeF = dyn_cast<Function>(CPA->getPointer());
+  // If the ptrauth constant isn't based on a function pointer, bail out.
+  if (!CalleeF)
+    return nullptr;
+
+  // Inspect the call ptrauth bundle to check it matches the ptrauth constant.
+  auto PAB = Call.getOperandBundle(LLVMContext::OB_ptrauth);
+  if (!PAB)
+    return nullptr;
+
+  auto *Key = cast<ConstantInt>(PAB->Inputs[0]);
+  Value *Discriminator = PAB->Inputs[1];
+
+  // If the bundle doesn't match, this is probably going to fail to auth.
+  if (!CPA->isKnownCompatibleWith(Key, Discriminator, DL))
+    return nullptr;
+
+  // If the bundle matches the constant, proceed to make this a direct call.
+  auto *NewCall = CallBase::removeOperandBundle(&Call, LLVMContext::OB_ptrauth);
+  NewCall->setCalledOperand(CalleeF);
+  return NewCall;
+}
+
 bool InstCombinerImpl::annotateAnyAllocSite(CallBase &Call,
                                             const TargetLibraryInfo *TLI) {
   // Note: We only handle cases which can't be driven from generic attributes
@@ -4210,6 +4315,14 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
   if (IntrinsicInst *II = findInitTrampoline(Callee))
     return transformCallThroughTrampoline(Call, *II);
 
+  // Combine calls involving pointer authentication intrinsics.
+  if (Instruction *NewCall = foldPtrAuthIntrinsicCallee(Call))
+    return NewCall;
+
+  // Combine calls to ptrauth constants.
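+  // Illustrative sketch (key/discriminator values assumed): a call such as
+  //   call void ptrauth (ptr @f, i32 0, i64 42)() [ "ptrauth"(i32 0, i64 42) ]
+  // authenticates exactly what the constant signed, so it can be turned into
+  //   call void @f()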
+  if (Instruction *NewCall = foldPtrAuthConstantCallee(Call))
+    return NewCall;
+
   if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
     InlineAsm *IA = cast<InlineAsm>(Callee);
     if (!IA->canThrow()) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c6f317a668cfe..9df08553d86e4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4305,10 +4305,6 @@ Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) {
     return nullptr;
 
   switch (LHSI->getOpcode()) {
-  case Instruction::PHI:
-    if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
-      return NV;
-    break;
   case Instruction::IntToPtr:
     // icmp pred inttoptr(X), null -> icmp pred X, 0
     if (RHSC->isNullValue() &&
@@ -7699,6 +7695,13 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
   if (Instruction *New = foldSignBitTest(I))
     return New;
 
+  if (auto *PN = dyn_cast<PHINode>(Op0))
+    if (Instruction *NV = foldOpIntoPhi(I, PN))
+      return NV;
+  if (auto *PN = dyn_cast<PHINode>(Op1))
+    if (Instruction *NV = foldOpIntoPhi(I, PN))
+      return NV;
+
   if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
     return Res;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 1b963952f1d61..f7fbf0815df03 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -283,6 +283,19 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Instruction *transformCallThroughTrampoline(CallBase &Call,
                                               IntrinsicInst &Tramp);
 
+  /// Try to optimize a call to the result of a ptrauth intrinsic, potentially
+  /// into the ptrauth call bundle:
+  /// - call(ptrauth.resign(p)), ["ptrauth"()] -> call p, ["ptrauth"()]
+  /// - call(ptrauth.sign(p)), ["ptrauth"()] -> call p
+  /// as long as the key/discriminator are the same in sign and auth-bundle,
+  /// and we don't change the key in the bundle (to a potentially-invalid key).
+  Instruction *foldPtrAuthIntrinsicCallee(CallBase &Call);
+
+  /// Try to optimize a call to a ptrauth constant, into its ptrauth bundle:
+  ///   call(ptrauth(f)), ["ptrauth"()] -> call f
+  /// as long as the key/discriminator are the same in constant and bundle.
+  Instruction *foldPtrAuthConstantCallee(CallBase &Call);
+
   // Return (a, b) if (LHS, RHS) is known to be (a, b) or (b, a).
   // Otherwise, return std::nullopt
   // Currently it matches:
@@ -812,9 +825,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
 
   bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock);
-  void tryToSinkInstructionDbgValues(
-      Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
-      BasicBlock *DestBlock,
-      SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers);
   void tryToSinkInstructionDbgVariableRecords(
       Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
       BasicBlock *DestBlock, SmallVectorImpl<DbgVariableRecord *> &DPUsers);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index e791bc4b56e52..2cc1bc9fa4a67 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -277,6 +277,15 @@ bool PointerReplacer::collectUsers() {
     Worklist.emplace_back(I);
   };
 
+  auto TryPushInstOperand = [&](Instruction *InstOp) {
+    if (!UsersToReplace.contains(InstOp)) {
+      if (!ValuesToRevisit.insert(InstOp))
+        return false;
+      Worklist.emplace_back(InstOp);
+    }
+    return true;
+  };
+
   PushUsersToWorklist(&Root);
   while (!Worklist.empty()) {
     Instruction *Inst = Worklist.pop_back_val();
@@ -309,12 +318,8 @@ bool PointerReplacer::collectUsers() {
       // incoming values.
       Worklist.emplace_back(PHI);
       for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) {
-        auto *IncomingValue = cast<Instruction>(PHI->getIncomingValue(Idx));
-        if (UsersToReplace.contains(IncomingValue))
-          continue;
-        if (!ValuesToRevisit.insert(IncomingValue))
+        if (!TryPushInstOperand(cast<Instruction>(PHI->getIncomingValue(Idx))))
           return false;
-        Worklist.emplace_back(IncomingValue);
       }
     } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
       auto *TrueInst = dyn_cast<Instruction>(SI->getTrueValue());
@@ -322,8 +327,17 @@ bool PointerReplacer::collectUsers() {
       if (!TrueInst || !FalseInst)
         return false;
 
-      UsersToReplace.insert(SI);
-      PushUsersToWorklist(SI);
+      if (isAvailable(TrueInst) && isAvailable(FalseInst)) {
+        UsersToReplace.insert(SI);
+        PushUsersToWorklist(SI);
+        continue;
+      }
+
+      // Push select back onto the stack, followed by unavailable true/false
+      // value.
+      Worklist.emplace_back(SI);
+      if (!TryPushInstOperand(TrueInst) || !TryPushInstOperand(FalseInst))
+        return false;
     } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       UsersToReplace.insert(GEP);
       PushUsersToWorklist(GEP);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 73ba0f78e8053..eb4332fbc0959 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -878,7 +878,11 @@ static Instruction *foldSetClearBits(SelectInst &Sel,
 // is a vector consisting of 0 and undefs. If a constant compared with x
 // is a scalar undefined value or undefined vector then an expression
 // should be already folded into a constant.
-static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) {
+//
+// This also holds for all operations such that Op(0) == 0,
+// e.g. Shl, UMin, etc.
+static Instruction *foldSelectZeroOrFixedOp(SelectInst &SI,
+                                            InstCombinerImpl &IC) {
   auto *CondVal = SI.getCondition();
   auto *TrueVal = SI.getTrueValue();
   auto *FalseVal = SI.getFalseValue();
@@ -900,10 +904,23 @@ static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) {
   // non-zero elements that are masked by undef elements in the compare
   // constant.
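   // A minimal example of the fold (illustrative):
   //   %c = icmp eq i32 %x, 0
   //   %m = mul i32 %x, %y
   //   %s = select i1 %c, i32 0, i32 %m
   // Here %s can be replaced by %m (with %y frozen), because mul with %x == 0
   // already yields 0.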
   auto *TrueValC = dyn_cast<Constant>(TrueVal);
-  if (TrueValC == nullptr ||
-      !match(FalseVal, m_c_Mul(m_Specific(X), m_Value(Y))) ||
-      !isa<Instruction>(FalseVal))
+  if (TrueValC == nullptr || !isa<Instruction>(FalseVal))
+    return nullptr;
+
+  bool FreezeY;
+  if (match(FalseVal, m_c_Mul(m_Specific(X), m_Value(Y))) ||
+      match(FalseVal, m_c_And(m_Specific(X), m_Value(Y))) ||
+      match(FalseVal, m_FShl(m_Specific(X), m_Specific(X), m_Value(Y))) ||
+      match(FalseVal, m_FShr(m_Specific(X), m_Specific(X), m_Value(Y))) ||
+      match(FalseVal,
+            m_c_Intrinsic<Intrinsic::umin>(m_Specific(X), m_Value(Y)))) {
+    FreezeY = true;
+  } else if (match(FalseVal, m_IDiv(m_Specific(X), m_Value(Y))) ||
+             match(FalseVal, m_IRem(m_Specific(X), m_Value(Y)))) {
+    FreezeY = false;
+  } else {
     return nullptr;
+  }
 
   auto *ZeroC = cast<Constant>(cast<Instruction>(CondVal)->getOperand(1));
   auto *MergedC = Constant::mergeUndefsWith(TrueValC, ZeroC);
@@ -914,9 +931,15 @@
     return nullptr;
 
   auto *FalseValI = cast<Instruction>(FalseVal);
-  auto *FrY = IC.InsertNewInstBefore(new FreezeInst(Y, Y->getName() + ".fr"),
-                                     FalseValI->getIterator());
-  IC.replaceOperand(*FalseValI, FalseValI->getOperand(0) == Y ? 0 : 1, FrY);
+  if (FreezeY) {
+    auto *FrY = IC.InsertNewInstBefore(new FreezeInst(Y, Y->getName() + ".fr"),
+                                       FalseValI->getIterator());
+    IC.replaceOperand(*FalseValI,
+                      FalseValI->getOperand(0) == Y
+                          ? 0
+                          : (FalseValI->getOperand(1) == Y ? 1 : 2),
+                      FrY);
+  }
   return IC.replaceInstUsesWith(SI, FalseValI);
 }
 
@@ -4104,7 +4127,7 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
     return Add;
   if (Instruction *Or = foldSetClearBits(SI, Builder))
     return Or;
-  if (Instruction *Mul = foldSelectZeroOrMul(SI, *this))
+  if (Instruction *Mul = foldSelectZeroOrFixedOp(SI, *this))
     return Mul;
 
   // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 91a1b61ddc483..a8bfd8c072d2f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3575,6 +3575,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
   std::unique_ptr<DIBuilder> DIB;
   if (isa<AllocaInst>(MI)) {
     findDbgUsers(DVIs, &MI, &DVRs);
+    assert(DVIs.empty());
     DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
   }
 
@@ -3644,9 +3645,6 @@
             ConstantInt::get(Type::getInt1Ty(C->getContext()),
                              C->isFalseWhenEqual()));
       } else if (auto *SI = dyn_cast<StoreInst>(I)) {
-        for (auto *DVI : DVIs)
-          if (DVI->isAddressOfVariable())
-            ConvertDebugDeclareToDebugValue(DVI, SI, *DIB);
         for (auto *DVR : DVRs)
           if (DVR->isAddressOfVariable())
             ConvertDebugDeclareToDebugValue(DVR, SI, *DIB);
@@ -3890,7 +3888,7 @@ bool InstCombinerImpl::removeInstructionsBeforeUnreachable(Instruction &I) {
   // This includes instructions like stores and "llvm.assume" that may not get
   // removed by simple dead code elimination.
   bool Changed = false;
-  while (Instruction *Prev = I.getPrevNonDebugInstruction()) {
+  while (Instruction *Prev = I.getPrevNode()) {
     // While we theoretically can erase EH, that would result in a block that
     // used to start with an EH no longer starting with EH, which is invalid.
// To make it valid, we'd need to fixup predecessors to no longer refer to @@ -4899,13 +4897,14 @@ InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) { // If operand is guaranteed not to be poison, there is no need to add freeze // to the operand. So we first find the operand that is not guaranteed to be // poison. - Use *MaybePoisonOperand = nullptr; - for (Use &U : OrigOpInst->operands()) { - if (isa(U.get()) || - isGuaranteedNotToBeUndefOrPoison(U.get())) + Value *MaybePoisonOperand = nullptr; + for (Value *V : OrigOpInst->operands()) { + if (isa(V) || isGuaranteedNotToBeUndefOrPoison(V) || + // Treat identical operands as a single operand. + (MaybePoisonOperand && MaybePoisonOperand == V)) continue; if (!MaybePoisonOperand) - MaybePoisonOperand = &U; + MaybePoisonOperand = V; else return nullptr; } @@ -4917,10 +4916,10 @@ InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) { return OrigOp; Builder.SetInsertPoint(OrigOpInst); - auto *FrozenMaybePoisonOperand = Builder.CreateFreeze( - MaybePoisonOperand->get(), MaybePoisonOperand->get()->getName() + ".fr"); + Value *FrozenMaybePoisonOperand = Builder.CreateFreeze( + MaybePoisonOperand, MaybePoisonOperand->getName() + ".fr"); - replaceUse(*MaybePoisonOperand, FrozenMaybePoisonOperand); + OrigOpInst->replaceUsesOfWith(MaybePoisonOperand, FrozenMaybePoisonOperand); return OrigOp; } @@ -5255,8 +5254,7 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, SmallVector DbgUsers; SmallVector DbgVariableRecords; findDbgUsers(DbgUsers, I, &DbgVariableRecords); - if (!DbgUsers.empty()) - tryToSinkInstructionDbgValues(I, InsertPos, SrcBlock, DestBlock, DbgUsers); + assert(DbgUsers.empty()); if (!DbgVariableRecords.empty()) tryToSinkInstructionDbgVariableRecords(I, InsertPos, SrcBlock, DestBlock, DbgVariableRecords); @@ -5273,71 +5271,12 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, return true; } -void InstCombinerImpl::tryToSinkInstructionDbgValues( - Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, - BasicBlock *DestBlock, SmallVectorImpl &DbgUsers) { - // For all debug values in the destination block, the sunk instruction - // will still be available, so they do not need to be dropped. - SmallVector DbgUsersToSalvage; - for (auto &DbgUser : DbgUsers) - if (DbgUser->getParent() != DestBlock) - DbgUsersToSalvage.push_back(DbgUser); - - // Process the sinking DbgUsersToSalvage in reverse order, as we only want - // to clone the last appearing debug intrinsic for each given variable. - SmallVector DbgUsersToSink; - for (DbgVariableIntrinsic *DVI : DbgUsersToSalvage) - if (DVI->getParent() == SrcBlock) - DbgUsersToSink.push_back(DVI); - llvm::sort(DbgUsersToSink, - [](auto *A, auto *B) { return B->comesBefore(A); }); - - SmallVector DIIClones; - SmallSet SunkVariables; - for (auto *User : DbgUsersToSink) { - // A dbg.declare instruction should not be cloned, since there can only be - // one per variable fragment. It should be left in the original place - // because the sunk instruction is not an alloca (otherwise we could not be - // here). - if (isa(User)) - continue; - - DebugVariable DbgUserVariable = - DebugVariable(User->getVariable(), User->getExpression(), - User->getDebugLoc()->getInlinedAt()); - - if (!SunkVariables.insert(DbgUserVariable).second) - continue; - - // Leave dbg.assign intrinsics in their original positions and there should - // be no need to insert a clone. 
- if (isa(User)) - continue; - - DIIClones.emplace_back(cast(User->clone())); - if (isa(User) && isa(I)) - DIIClones.back()->replaceVariableLocationOp(I, I->getOperand(0)); - LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n'); - } - - // Perform salvaging without the clones, then sink the clones. - if (!DIIClones.empty()) { - salvageDebugInfoForDbgValues(*I, DbgUsersToSalvage, {}); - // The clones are in reverse order of original appearance, reverse again to - // maintain the original order. - for (auto &DIIClone : llvm::reverse(DIIClones)) { - DIIClone->insertBefore(InsertPos); - LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n'); - } - } -} - void InstCombinerImpl::tryToSinkInstructionDbgVariableRecords( Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, BasicBlock *DestBlock, SmallVectorImpl &DbgVariableRecords) { - // Implementation of tryToSinkInstructionDbgValues, but for the - // DbgVariableRecord of variable assignments rather than dbg.values. + // For all debug values in the destination block, the sunk instruction + // will still be available, so they do not need to be dropped. // Fetch all DbgVariableRecords not already in the destination. SmallVector DbgVariableRecordsToSalvage; diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 840a5e3f31dfd..5957940add577 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -3396,8 +3396,8 @@ void FunctionStackPoisoner::processDynamicAllocas() { static void findStoresToUninstrumentedArgAllocas( AddressSanitizer &ASan, Instruction &InsBefore, SmallVectorImpl &InitInsts) { - Instruction *Start = InsBefore.getNextNonDebugInstruction(); - for (Instruction *It = Start; It; It = It->getNextNonDebugInstruction()) { + Instruction *Start = InsBefore.getNextNode(); + for (Instruction *It = Start; It; It = It->getNextNode()) { // Argument initialization looks like: // 1) store , OR // 2) = cast to ... @@ -3424,7 +3424,7 @@ static void findStoresToUninstrumentedArgAllocas( isa(cast(Val)->getOperand(0)) && // Check that the cast appears directly before the store. Otherwise // moving the cast before InsBefore may break the IR. 
- Val == It->getPrevNonDebugInstruction(); + Val == It->getPrevNode(); bool IsArgInit = IsDirectArgInit || IsArgInitViaCast; if (!IsArgInit) continue; diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 71ab61253e643..e5bf2d1187a89 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -27,7 +27,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CRC.h" diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 77db686f8229c..2c34bf2157cdd 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1421,7 +1421,7 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { bool HWAddressSanitizer::instrumentLandingPads( SmallVectorImpl &LandingPadVec) { for (auto *LP : LandingPadVec) { - IRBuilder<> IRB(LP->getNextNonDebugInstruction()); + IRBuilder<> IRB(LP->getNextNode()); IRB.CreateCall( HwasanHandleVfork, {memtag::readRegister( @@ -1446,7 +1446,7 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, auto N = I++; auto *AI = KV.first; memtag::AllocaInfo &Info = KV.second; - IRBuilder<> IRB(AI->getNextNonDebugInstruction()); + IRBuilder<> IRB(AI->getNextNode()); // Replace uses of the alloca with tagged address. Value *Tag = getAllocaTag(IRB, StackTag, N); diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp index e7a6fa48e9004..55f3239ada100 100644 --- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp +++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -184,22 +185,14 @@ void LowerAllowCheckPass::printPipeline( // correctness. // TODO: print shorter output by combining adjacent runs, etc. 
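  // Illustrative output (assumed option values): the loop below prints
  // "cutoffs[1]=100;cutoffs[3]=500" and, when set, appends ";runtime_check=1".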
   int i = 0;
-  bool printed = false;
+  ListSeparator LS(";");
   for (unsigned int cutoff : Opts.cutoffs) {
-    if (cutoff > 0) {
-      if (printed)
-        OS << ";";
-      OS << "cutoffs[" << i << "]=" << cutoff;
-      printed = true;
-    }
-
+    if (cutoff > 0)
+      OS << LS << "cutoffs[" << i << "]=" << cutoff;
     i++;
   }
-  if (Opts.runtime_check) {
-    if (printed)
-      OS << ";";
-    OS << "runtime_check=" << Opts.runtime_check;
-  }
+  if (Opts.runtime_check)
+    OS << LS << "runtime_check=" << Opts.runtime_check;
 
   OS << '>';
 }
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 5f5200b2c9e62..7b5831652aa6e 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2509,9 +2509,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   //
   //   S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
   //
-  //  Addendum if the "Or" is "disjoint":
-  //    1|1 => p;
-  //    S = S | (V1 & V2)
+  // If the "disjoint OR" property is violated, the result is poison, and
+  // hence the entire shadow is uninitialized:
+  //   S = S | SignExt(V1 & V2 != 0)
   Value *S1 = getShadow(&I, 0);
   Value *S2 = getShadow(&I, 1);
   Value *V1 = I.getOperand(0);
@@ -2532,7 +2532,9 @@
     if (ClPreciseDisjointOr && cast<PossiblyDisjointInst>(&I)->isDisjoint()) {
       Value *V1V2 = IRB.CreateAnd(V1, V2);
-      S = IRB.CreateOr(S, V1V2, "_ms_disjoint");
+      Value *DisjointOrShadow = IRB.CreateSExt(
+          IRB.CreateICmpNE(V1V2, getCleanShadow(V1V2)), V1V2->getType());
+      S = IRB.CreateOr(S, DisjointOrShadow, "_ms_disjoint");
     }
 
     setShadow(&I, S);
@@ -4322,8 +4324,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     if (isa<Constant>(Idx))
       return;
 
+    auto *IdxShadow = getShadow(Idx);
     Value *Truncated = IRB.CreateTrunc(
-        Idx,
+        IdxShadow,
         FixedVectorType::get(Type::getIntNTy(*MS.C, Log2_64(IdxVectorSize)),
                              IdxVectorSize));
     insertCheckShadow(Truncated, getOrigin(Idx), I);
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
index b47ef8523ea11..a3d4e5367b9ab 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -102,12 +102,12 @@ public:
   void run(std::vector<CandidateInfo> &Candidates) {
     std::vector<Instruction *> Result = findVTableAddrs(F);
     for (Instruction *I : Result) {
-      Instruction *InsertPt = I->getNextNonDebugInstruction();
+      Instruction *InsertPt = I->getNextNode();
       // When finding an insertion point, keep PHI and EH pad instructions
       // before vp intrinsics. This is similar to
       // `BasicBlock::getFirstInsertionPt`.
       while (InsertPt && (dyn_cast<PHINode>(InsertPt) || InsertPt->isEHPad()))
-        InsertPt = InsertPt->getNextNonDebugInstruction();
+        InsertPt = InsertPt->getNextNode();
       // Skip instrumenting the value if InsertPt is the last instruction.
       // FIXME: Set InsertPt to the end of the basic block to instrument the
       // value if InsertPt is the last instruction.
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 2786d81773ed9..df3160233c510 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1489,6 +1489,7 @@ static bool checkAndReplaceCondition( SmallVector DbgUsers; SmallVector DVRUsers; findDbgUsers(DbgUsers, Cmp, &DVRUsers); + assert(DbgUsers.empty()); for (auto *DVR : DVRUsers) { auto *DTN = DT.getNode(DVR->getParent()); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 85dd9a1bf7161..0f63ed0166cf4 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -2079,6 +2079,7 @@ struct DSEState { AllocFnKind AllocKind = Attrs.getFnAttr(Attribute::AllocKind).getAllocKind() | AllocFnKind::Zeroed; + AllocKind &= ~AllocFnKind::Uninitialized; Attrs = Attrs.addFnAttribute(Ctx, Attribute::getWithAllocKind(Ctx, AllocKind)) .removeFnAttribute(Ctx, "alloc-variant-zeroed"); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index d9d05c3e8cc49..f6bf09d09433d 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1310,7 +1310,7 @@ static Value *findDominatingValue(const MemoryLocation &Loc, Type *LoadTy, BatchAAResults BatchAA(*AA); for (BasicBlock *BB = FromBB; BB; BB = BB->getSinglePredecessor()) for (auto *Inst = BB == FromBB ? From : BB->getTerminator(); - Inst != nullptr; Inst = Inst->getPrevNonDebugInstruction()) { + Inst != nullptr; Inst = Inst->getPrevNode()) { // Stop the search if limit is reached. if (++NumVisitedInsts > MaxNumVisitedInsts) return nullptr; @@ -2367,11 +2367,14 @@ uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred, // See if we can refine the value number by looking at the PN incoming value // for the given predecessor. 
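  // For example (illustrative): given %p = phi i32 [ %a, %pred ], [ %b, %other ]
  // in PhiBlock, translating %p's value number across the %pred edge returns
  // the number recorded for %a, when one exists.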
   if (PHINode *PN = NumberingPhi[Num]) {
-    if (PN->getParent() == PhiBlock)
-      for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I)
-        if (PN->getIncomingBlock(I) == Pred)
-          if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false))
-            return TransVal;
+    if (PN->getParent() != PhiBlock)
+      return Num;
+    for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) {
+      if (PN->getIncomingBlock(I) != Pred)
+        continue;
+      if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false))
+        return TransVal;
+    }
     return Num;
   }
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index 2058df33ea331..a5fc0b4c6904d 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -799,7 +799,7 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
                                   BasicBlock *BBEnd) {
   SmallVector<Instruction *> Insts;
   for (BasicBlock *BB : Blocks)
-    Insts.push_back(BB->getTerminator()->getPrevNonDebugInstruction());
+    Insts.push_back(BB->getTerminator()->getPrevNode());
 
   Instruction *I0 = Insts.front();
   SmallVector<Value *> NewOperands;
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index a5008907b9014..a0f9f3c4a35a5 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -379,6 +379,10 @@ class LoopInterchangeLegality {
     return InnerLoopInductions;
   }
 
+  ArrayRef<Instruction *> getHasNoWrapReductions() const {
+    return HasNoWrapReductions;
+  }
+
 private:
   bool tightlyNested(Loop *Outer, Loop *Inner);
   bool containsUnsafeInstructions(BasicBlock *BB);
@@ -405,6 +409,11 @@ class LoopInterchangeLegality {
   /// Set of inner loop induction PHIs
   SmallVector<PHINode *> InnerLoopInductions;
+
+  /// Holds instructions that have nuw/nsw flags and are involved in
+  /// reductions, like integer addition/multiplication. Those flags must be
+  /// dropped when interchanging the loops.
+  SmallVector<Instruction *> HasNoWrapReductions;
 };
 
 /// Manages information utilized by the profitability check for cache. The main
@@ -473,7 +482,7 @@ class LoopInterchangeTransform {
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
 
   /// Interchange OuterLoop and InnerLoop.
-  bool transform();
+  bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
   void restructureLoops(Loop *NewInner, Loop *NewOuter,
                         BasicBlock *OrigInnerPreHeader,
                         BasicBlock *OrigOuterPreHeader);
@@ -613,7 +622,7 @@ struct LoopInterchange {
     });
 
     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
-    LIT.transform();
+    LIT.transform(LIL.getHasNoWrapReductions());
     LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
     LoopsInterchanged++;
@@ -798,7 +807,9 @@ static Value *followLCSSA(Value *SV) {
 }
 
 // Check V's users to see if it is involved in a reduction in L.
-static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
+static PHINode *
+findInnerReductionPhi(Loop *L, Value *V,
+                      SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
   // Reduction variables cannot be constants.
   if (isa<Constant>(V))
     return nullptr;
@@ -812,7 +823,66 @@
     // Detect floating point reduction only when it can be reordered.
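    // e.g. (illustrative) a floating-point "sum += A[j][i]" reduction is only
    // accepted here when reassociation is allowed, since interchange changes
    // the order in which the terms are added.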
     if (RD.getExactFPMathInst() != nullptr)
       return nullptr;
-    return PHI;
+
+    RecurKind RK = RD.getRecurrenceKind();
+    switch (RK) {
+    case RecurKind::Or:
+    case RecurKind::And:
+    case RecurKind::Xor:
+    case RecurKind::SMin:
+    case RecurKind::SMax:
+    case RecurKind::UMin:
+    case RecurKind::UMax:
+    case RecurKind::FAdd:
+    case RecurKind::FMul:
+    case RecurKind::FMin:
+    case RecurKind::FMax:
+    case RecurKind::FMinimum:
+    case RecurKind::FMaximum:
+    case RecurKind::FMinimumNum:
+    case RecurKind::FMaximumNum:
+    case RecurKind::FMulAdd:
+    case RecurKind::AnyOf:
+      return PHI;
+
+    // Changing the order of integer addition/multiplication may change the
+    // semantics. Consider the following case:
+    //
+    //   int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
+    //   int sum = 0;
+    //   for (int i = 0; i < 2; i++)
+    //     for (int j = 0; j < 2; j++)
+    //       sum += A[j][i];
+    //
+    // If the above loops are exchanged, the addition will cause an
+    // overflow. To prevent this, we must drop the nuw/nsw flags from the
+    // addition/multiplication instructions when we actually exchange the
+    // loops.
+    case RecurKind::Add:
+    case RecurKind::Mul: {
+      unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
+      SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(PHI, L);
+
+      // Bail out when we fail to collect the reduction instruction chain.
+      if (Ops.empty())
+        return nullptr;
+
+      for (Instruction *I : Ops) {
+        assert(I->getOpcode() == OpCode &&
+               "Expected the instruction to be the reduction operation");
+        (void)OpCode;
+
+        // If the instruction has nuw/nsw flags, we must drop them when the
+        // transformation is actually performed.
+        if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
+          HasNoWrapInsts.push_back(I);
+      }
+      return PHI;
+    }
+
+    default:
+      return nullptr;
+    }
   }
   return nullptr;
 }
@@ -844,7 +914,8 @@ bool LoopInterchangeLegality::findInductionAndReductions(
     // Check if we have a PHI node in the outer loop that has a reduction
     // result from the inner loop as an incoming value.
     Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
-    PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
+    PHINode *InnerRedPhi =
+        findInnerReductionPhi(InnerLoop, V, HasNoWrapReductions);
     if (!InnerRedPhi ||
         !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
       LLVM_DEBUG(
@@ -1430,7 +1501,8 @@ void LoopInterchangeTransform::restructureLoops(
   SE->forgetLoop(NewOuter);
 }
 
-bool LoopInterchangeTransform::transform() {
+bool LoopInterchangeTransform::transform(
+    ArrayRef<Instruction *> DropNoWrapInsts) {
   bool Transformed = false;
 
   if (InnerLoop->getSubLoops().empty()) {
@@ -1531,6 +1603,13 @@ bool LoopInterchangeTransform::transform() {
       return false;
     }
 
+  // Finally, drop the nsw/nuw flags from the instructions involved in the
+  // reduction calculations.
+  for (Instruction *Reduction : DropNoWrapInsts) {
+    Reduction->setHasNoSignedWrap(false);
+    Reduction->setHasNoUnsignedWrap(false);
+  }
+
   return true;
 }
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 32616bfe68ca0..dc8fa4379752f 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2705,7 +2705,7 @@ LSRInstance::OptimizeLoopTermCond() {
     // It's possible for the setcc instruction to be anywhere in the loop, and
     // possible for it to have multiple users. If it is not immediately before
    // the exiting block branch, move it.
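    // Illustrative: for "%c = icmp slt i64 %iv, %n; ...; br i1 %c", %c is
    // moved down to sit right before the branch, or cloned there if it has
    // other users.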
- if (Cond->getNextNonDebugInstruction() != TermBr) { + if (Cond->getNextNode() != TermBr) { if (Cond->hasOneUse()) { Cond->moveBefore(TermBr->getIterator()); } else { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index a22d84dcf014d..f7d2258e1c283 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -41,7 +41,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 2b0e221f341e0..84d1c0b765978 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -730,7 +730,7 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca, DL.getTypeStoreSize(T), BAA)) { // Avoid invalidating the iterator. - BBI = SI->getNextNonDebugInstruction()->getIterator(); + BBI = SI->getNextNode()->getIterator(); eraseInstruction(SI); eraseInstruction(LI); ++NumMemCpyInstr; @@ -1863,7 +1863,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) { if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, TypeSize::getFixed(Len->getZExtValue()), BAA)) { // Avoid invalidating the iterator. - BBI = M->getNextNonDebugInstruction()->getIterator(); + BBI = M->getNextNode()->getIterator(); eraseInstruction(M); ++NumMemCpyInstr; return true; diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 70b4552190a4e..23256cf2acbd2 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -315,18 +315,11 @@ calculateFragment(DILocalVariable *Variable, return UseFrag; } -static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) { - return DebugVariable(DVI->getVariable(), std::nullopt, - DVI->getDebugLoc().getInlinedAt()); -} static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) { return DebugVariable(DVR->getVariable(), std::nullopt, DVR->getDebugLoc().getInlinedAt()); } -/// Helpers for handling new and old debug info modes in migrateDebugInfo. -/// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based -/// on the \p Unused parameter type. DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) { (void)Unused; return static_cast(cast(P)); @@ -376,9 +369,6 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, /// Map of aggregate variables to their fragment associated with OldAlloca. 
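The getNextNonDebugInstruction() to getNextNode() rewrites in the surrounding hunks all rest on the same invariant. A minimal sketch, assuming the record-based debug-info mode in which no llvm.dbg.* intrinsic calls remain in the instruction list; the helper name is illustrative, not from the patch.

#include "llvm/IR/Instruction.h"
using namespace llvm;

// Every node in the instruction list is now a "real" instruction: debug data
// hangs off I.getDbgRecordRange() rather than occupying list nodes, so
// skipping "non-debug" instructions degenerates into plain list traversal.
Instruction *nextRealInstruction(Instruction &I) {
  return I.getNextNode(); // was: I.getNextNonDebugInstruction()
}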
DenseMap> BaseFragments; - for (auto *DAI : at::getAssignmentMarkers(OldAlloca)) - BaseFragments[getAggregateVariable(DAI)] = - DAI->getExpression()->getFragmentInfo(); for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca)) BaseFragments[getAggregateVariable(DVR)] = DVR->getExpression()->getFragmentInfo(); @@ -391,7 +381,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false); assert(OldAlloca->isStaticAlloca()); - auto MigrateDbgAssign = [&](auto *DbgAssign) { + auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) { LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign << "\n"); auto *Expr = DbgAssign->getExpression(); @@ -486,7 +476,6 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n"); }; - for_each(MarkerRange, MigrateDbgAssign); for_each(DVRAssignMarkerRange, MigrateDbgAssign); } @@ -5119,36 +5108,13 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, } // There isn't a shared interface to get the "address" parts out of a -// dbg.declare and dbg.assign, so provide some wrappers now for -// both debug intrinsics and records. -const Value *getAddress(const DbgVariableIntrinsic *DVI) { - if (const auto *DAI = dyn_cast(DVI)) - return DAI->getAddress(); - return cast(DVI)->getAddress(); -} - -const Value *getAddress(const DbgVariableRecord *DVR) { - return DVR->getAddress(); -} - -bool isKillAddress(const DbgVariableIntrinsic *DVI) { - if (const auto *DAI = dyn_cast(DVI)) - return DAI->isKillAddress(); - return cast(DVI)->isKillLocation(); -} - +// dbg.declare and dbg.assign, so provide some wrappers. bool isKillAddress(const DbgVariableRecord *DVR) { if (DVR->getType() == DbgVariableRecord::LocationType::Assign) return DVR->isKillAddress(); return DVR->isKillLocation(); } -const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) { - if (const auto *DAI = dyn_cast(DVI)) - return DAI->getAddressExpression(); - return cast(DVI)->getExpression(); -} - const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) { if (DVR->getType() == DbgVariableRecord::LocationType::Assign) return DVR->getAddressExpression(); @@ -5236,66 +5202,6 @@ static DIExpression *createOrReplaceFragment(const DIExpression *Expr, return DIExpression::get(Expr->getContext(), Ops); } -/// Insert a new dbg.declare. -/// \p Orig Original to copy debug loc and variable from. -/// \p NewAddr Location's new base address. -/// \p NewAddrExpr New expression to apply to address. -/// \p BeforeInst Insert position. -/// \p NewFragment New fragment (absolute, non-relative). -/// \p BitExtractAdjustment Offset to apply to any extract_bits op. -static void -insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr, - DIExpression *NewAddrExpr, Instruction *BeforeInst, - std::optional NewFragment, - int64_t BitExtractAdjustment) { - if (NewFragment) - NewAddrExpr = createOrReplaceFragment(NewAddrExpr, *NewFragment, - BitExtractAdjustment); - if (!NewAddrExpr) - return; - - DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr, - Orig->getDebugLoc(), BeforeInst->getIterator()); -} - -/// Insert a new dbg.assign. -/// \p Orig Original to copy debug loc, variable, value and value expression -/// from. -/// \p NewAddr Location's new base address. -/// \p NewAddrExpr New expression to apply to address. -/// \p BeforeInst Insert position. -/// \p NewFragment New fragment (absolute, non-relative). 
-/// \p BitExtractAdjustment Offset to apply to any extract_bits op. -static void -insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr, - DIExpression *NewAddrExpr, Instruction *BeforeInst, - std::optional NewFragment, - int64_t BitExtractAdjustment) { - // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr. - (void)BeforeInst; - - // A dbg.assign puts fragment info in the value expression only. The address - // expression has already been built: NewAddrExpr. - DIExpression *NewFragmentExpr = Orig->getExpression(); - if (NewFragment) - NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment, - BitExtractAdjustment); - if (!NewFragmentExpr) - return; - - // Apply a DIAssignID to the store if it doesn't already have it. - if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) { - NewAddr->setMetadata(LLVMContext::MD_DIAssignID, - DIAssignID::getDistinct(NewAddr->getContext())); - } - - Instruction *NewAssign = cast(DIB.insertDbgAssign( - NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr, - NewAddrExpr, Orig->getDebugLoc())); - LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n"); - (void)NewAssign; -} - /// Insert a new DbgRecord. /// \p Orig Original to copy record type, debug loc and variable from, and /// additionally value and value expression for dbg_assign records. @@ -5457,12 +5363,12 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // Migrate debug information from the old alloca to the new alloca(s) // and the individual partitions. - auto MigrateOne = [&](auto *DbgVariable) { + auto MigrateOne = [&](DbgVariableRecord *DbgVariable) { // Can't overlap with undef memory. if (isKillAddress(DbgVariable)) return; - const Value *DbgPtr = getAddress(DbgVariable); + const Value *DbgPtr = DbgVariable->getAddress(); DIExpression::FragmentInfo VarFrag = DbgVariable->getFragmentOrEntireVariable(); // Get the address expression constant offset if one exists and the ops @@ -5543,7 +5449,6 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { if (SameVariableFragment(OldDII, DbgVariable)) OldDII->eraseFromParent(); }; - for_each(findDbgDeclares(Fragment.Alloca), RemoveOne); for_each(findDVRDeclares(Fragment.Alloca), RemoveOne); for_each(findDVRValues(Fragment.Alloca), RemoveOne); insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI, @@ -5553,10 +5458,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // Migrate debug information from the old alloca to the new alloca(s) // and the individual partitions. - for_each(findDbgDeclares(&AI), MigrateOne); for_each(findDVRDeclares(&AI), MigrateOne); for_each(findDVRValues(&AI), MigrateOne); - for_each(at::getAssignmentMarkers(&AI), MigrateOne); for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne); return Changed; @@ -5777,8 +5680,6 @@ bool SROA::deleteDeadInstructions( // not be able to find it. 
if (AllocaInst *AI = dyn_cast(I)) { DeletedAllocas.insert(AI); - for (DbgDeclareInst *OldDII : findDbgDeclares(AI)) - OldDII->eraseFromParent(); for (DbgVariableRecord *OldDII : findDVRDeclares(AI)) OldDII->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index a09303bb4469f..60e5df08c6efd 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -194,8 +194,7 @@ static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F, // Calls to experimental_deoptimize must be followed by a return // of the value computed by experimental_deoptimize. // I.e., we can not change `ret` to `br` for this block. - if (auto *CI = - dyn_cast_or_null(Term->getPrevNonDebugInstruction())) { + if (auto *CI = dyn_cast_or_null(Term->getPrevNode())) { if (Function *F = CI->getCalledFunction()) if (Intrinsic::ID ID = F->getIntrinsicID()) if (ID == Intrinsic::experimental_deoptimize) diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index f053e202655be..ebcbd2bf87a87 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -66,7 +66,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index fccb73a36b182..b187208bc238c 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -576,9 +576,8 @@ void PruningFunctionCloner::CloneBlock( } // Eagerly remap operands to the newly cloned instruction, except for PHI - // nodes for which we defer processing until we update the CFG. Also defer - // debug intrinsic processing because they may contain use-before-defs. - if (!isa(NewInst) && !isa(NewInst)) { + // nodes for which we defer processing until we update the CFG. + if (!isa(NewInst)) { RemapInstruction(NewInst, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); @@ -733,15 +732,6 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, StartingInst = &StartingBB->front(); } - // Collect debug intrinsics for remapping later. - SmallVector DbgIntrinsics; - for (const auto &BB : *OldFunc) { - for (const auto &I : BB) { - if (const auto *DVI = dyn_cast(&I)) - DbgIntrinsics.push_back(DVI); - } - } - // Clone the entry block, and anything recursively reachable from it. std::vector CloneWorklist; PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist); @@ -899,21 +889,11 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Restore attributes. NewFunc->setAttributes(Attrs); - // Remap debug intrinsic operands now that all values have been mapped. - // Doing this now (late) preserves use-before-defs in debug intrinsics. If + // Remap debug records operands now that all values have been mapped. + // Doing this now (late) preserves use-before-defs in debug records. If // we didn't do this, ValueAsMetadata(use-before-def) operands would be // replaced by empty metadata. This would signal later cleanup passes to - // remove the debug intrinsics, potentially causing incorrect locations. 
- for (const auto *DVI : DbgIntrinsics) { - if (DbgVariableIntrinsic *NewDVI = - cast_or_null(VMap.lookup(DVI))) - RemapInstruction(NewDVI, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer); - } - - // Do the same for DbgVariableRecords, touching all the instructions in the - // cloned range of blocks. + // remove the debug records, potentially causing incorrect locations. Function::iterator Begin = cast(VMap[StartingBB])->getIterator(); for (BasicBlock &BB : make_range(Begin, NewFunc->end())) { for (Instruction &I : BB) { diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index eacaf42e4e8ba..1d1af42153325 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1222,9 +1222,7 @@ static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) { SmallVector DbgUsers; SmallVector DbgVariableRecords; findDbgUsers(DbgUsers, &I, &DbgVariableRecords); - for (DbgVariableIntrinsic *DVI : DbgUsers) - if (DVI->getFunction() != &F) - DVI->eraseFromParent(); + assert(DbgUsers.empty()); for (DbgVariableRecord *DVR : DbgVariableRecords) if (DVR->getFunction() != &F) DVR->eraseFromParent(); @@ -1289,14 +1287,12 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, SmallVector DbgUsers; SmallVector DPUsers; findDbgUsers(DbgUsers, Input, &DPUsers); + assert(DbgUsers.empty()); DIExpression *Expr = DIB.createExpression(); // Iterate the debud users of the Input values. If they are in the extracted // function then update their location with the new value. If they are in // the parent function then create a similar debug record. - for (auto *DVI : DbgUsers) - UpdateOrInsertDebugRecord(DVI, Input, NewVal, Expr, - isa(DVI)); for (auto *DVR : DPUsers) UpdateOrInsertDebugRecord(DVR, Input, NewVal, Expr, DVR->isDbgDeclare()); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 6929d14bc56ea..ed3dca2f7c307 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1978,14 +1978,13 @@ static at::StorageToVarsMap collectEscapedLocals(const DataLayout &DL, continue; // Find all local variables associated with the backing storage. - auto CollectAssignsForStorage = [&](auto *DbgAssign) { + auto CollectAssignsForStorage = [&](DbgVariableRecord *DbgAssign) { // Skip variables from inlined functions - they are not local variables. if (DbgAssign->getDebugLoc().getInlinedAt()) return; LLVM_DEBUG(errs() << " > DEF : " << *DbgAssign << "\n"); EscapedLocals[Base].insert(at::VarRecord(DbgAssign)); }; - for_each(at::getAssignmentMarkers(Base), CollectAssignsForStorage); for_each(at::getDVRAssignmentMarkers(Base), CollectAssignsForStorage); } return EscapedLocals; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 81d85375b9e1d..ee3e56c3c6db9 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -428,10 +428,6 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I, if (I->isEHPad()) return false; - // We don't want debug info removed by anything this general. 
- if (isa(I)) - return false; - if (const DbgLabelInst *DLI = dyn_cast(I)) { if (DLI->getLabel()) return false; @@ -617,11 +613,10 @@ bool llvm::replaceDbgUsesWithUndef(Instruction *I) { SmallVector DbgUsers; SmallVector DPUsers; findDbgUsers(DbgUsers, I, &DPUsers); - for (auto *DII : DbgUsers) - DII->setKillLocation(); + assert(DbgUsers.empty()); for (auto *DVR : DPUsers) DVR->setKillLocation(); - return !DbgUsers.empty() || !DPUsers.empty(); + return !DPUsers.empty(); } /// areAllUsesEqual - Check whether the uses of a value are all the same. @@ -1632,33 +1627,6 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar, /// describes an alloca'd variable, so we need to use the alloc size of the /// value when doing the comparison. E.g. an i1 value will be identified as /// covering an n-bit fragment, if the store size of i1 is at least n bits. -static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) { - const DataLayout &DL = DII->getDataLayout(); - TypeSize ValueSize = DL.getTypeAllocSizeInBits(ValTy); - if (std::optional FragmentSize = - DII->getExpression()->getActiveBits(DII->getVariable())) - return TypeSize::isKnownGE(ValueSize, TypeSize::getFixed(*FragmentSize)); - - // We can't always calculate the size of the DI variable (e.g. if it is a - // VLA). Try to use the size of the alloca that the dbg intrinsic describes - // instead. - if (DII->isAddressOfVariable()) { - // DII should have exactly 1 location when it is an address. - assert(DII->getNumVariableLocationOps() == 1 && - "address of variable must have exactly 1 location operand."); - if (auto *AI = - dyn_cast_or_null(DII->getVariableLocationOp(0))) { - if (std::optional FragmentSize = - AI->getAllocationSizeInBits(DL)) { - return TypeSize::isKnownGE(ValueSize, *FragmentSize); - } - } - } - // Could not determine size of variable. Conservatively return false. - return false; -} -// RemoveDIs: duplicate implementation of the above, using DbgVariableRecords, -// the replacement for dbg.values. static bool valueCoversEntireFragment(Type *ValTy, DbgVariableRecord *DVR) { const DataLayout &DL = DVR->getModule()->getDataLayout(); TypeSize ValueSize = DL.getTypeAllocSizeInBits(ValTy); @@ -1695,106 +1663,12 @@ static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV, Instr->getParent()->insertDbgRecordBefore(DVRec, Instr); } -static void insertDbgValueOrDbgVariableRecordAfter( - DIBuilder &Builder, Value *DV, DILocalVariable *DIVar, DIExpression *DIExpr, - const DebugLoc &NewLoc, Instruction *Instr) { - BasicBlock::iterator NextIt = std::next(Instr->getIterator()); - NextIt.setHeadBit(true); - insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, NextIt); -} - -/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value -/// that has an associated llvm.dbg.declare intrinsic. -void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - StoreInst *SI, DIBuilder &Builder) { - assert(DII->isAddressOfVariable() || isa(DII)); - auto *DIVar = DII->getVariable(); - assert(DIVar && "Missing variable"); - auto *DIExpr = DII->getExpression(); - Value *DV = SI->getValueOperand(); - - DebugLoc NewLoc = getDebugValueLoc(DII); - - // If the alloca describes the variable itself, i.e. the expression in the - // dbg.declare doesn't start with a dereference, we can perform the - // conversion if the value covers the entire fragment of DII. - // If the alloca describes the *address* of DIVar, i.e. DIExpr is - // *just* a DW_OP_deref, we use DV as is for the dbg.value. 
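The assert(DbgUsers.empty()) insertions in this and the following hunks all enforce one invariant. A self-contained sketch of the pattern, assuming findDbgUsers keeps its two-output signature; the wrapper function is illustrative only.

#include <cassert>
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugProgramInstruction.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

void killDebugUsers(Instruction &I) {
  SmallVector<DbgVariableIntrinsic *> DbgUsers;
  SmallVector<DbgVariableRecord *> DPUsers;
  findDbgUsers(DbgUsers, &I, &DPUsers);
  // With debug info stored as records, no intrinsic users can exist anymore;
  // only the DbgVariableRecord list may be populated.
  assert(DbgUsers.empty() && "debug intrinsics are no longer expected");
  for (DbgVariableRecord *DVR : DPUsers)
    DVR->setKillLocation();
}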
- // We conservatively ignore other dereferences, because the following two are - // not equivalent: - // dbg.declare(alloca, ..., !Expr(deref, plus_uconstant, 2)) - // dbg.value(DV, ..., !Expr(deref, plus_uconstant, 2)) - // The former is adding 2 to the address of the variable, whereas the latter - // is adding 2 to the value of the variable. As such, we insist on just a - // deref expression. - bool CanConvert = - DIExpr->isDeref() || (!DIExpr->startsWithDeref() && - valueCoversEntireFragment(DV->getType(), DII)); - if (CanConvert) { - insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, - SI->getIterator()); - return; - } - - // FIXME: If storing to a part of the variable described by the dbg.declare, - // then we want to insert a dbg.value for the corresponding fragment. - LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " << *DII - << '\n'); - // For now, when there is a store to parts of the variable (but we do not - // know which part) we insert an dbg.value intrinsic to indicate that we - // know nothing about the variable's content. - DV = PoisonValue::get(DV->getType()); - insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, - SI->getIterator()); -} - static DIExpression *dropInitialDeref(const DIExpression *DIExpr) { int NumEltDropped = DIExpr->getElements()[0] == dwarf::DW_OP_LLVM_arg ? 3 : 1; return DIExpression::get(DIExpr->getContext(), DIExpr->getElements().drop_front(NumEltDropped)); } -void llvm::InsertDebugValueAtStoreLoc(DbgVariableIntrinsic *DII, StoreInst *SI, - DIBuilder &Builder) { - auto *DIVar = DII->getVariable(); - assert(DIVar && "Missing variable"); - auto *DIExpr = DII->getExpression(); - DIExpr = dropInitialDeref(DIExpr); - Value *DV = SI->getValueOperand(); - - DebugLoc NewLoc = getDebugValueLoc(DII); - - insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, - SI->getIterator()); -} - -/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value -/// that has an associated llvm.dbg.declare intrinsic. -void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - LoadInst *LI, DIBuilder &Builder) { - auto *DIVar = DII->getVariable(); - auto *DIExpr = DII->getExpression(); - assert(DIVar && "Missing variable"); - - if (!valueCoversEntireFragment(LI->getType(), DII)) { - // FIXME: If only referring to a part of the variable described by the - // dbg.declare, then we want to insert a dbg.value for the corresponding - // fragment. - LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " - << *DII << '\n'); - return; - } - - DebugLoc NewLoc = getDebugValueLoc(DII); - - // We are now tracking the loaded value instead of the address. In the - // future if multi-location support is added to the IR, it might be - // preferable to keep tracking both the loaded value and the original - // address in case the alloca can not be elided. - insertDbgValueOrDbgVariableRecordAfter(Builder, LI, DIVar, DIExpr, NewLoc, - LI); -} - void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, StoreInst *SI, DIBuilder &Builder) { assert(DVR->isAddressOfVariable() || DVR->isDbgAssign()); @@ -1855,40 +1729,6 @@ void llvm::InsertDebugValueAtStoreLoc(DbgVariableRecord *DVR, StoreInst *SI, SI->getIterator()); } -/// Inserts a llvm.dbg.value intrinsic after a phi that has an associated -/// llvm.dbg.declare intrinsic. 
-void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - PHINode *APN, DIBuilder &Builder) { - auto *DIVar = DII->getVariable(); - auto *DIExpr = DII->getExpression(); - assert(DIVar && "Missing variable"); - - if (PhiHasDebugValue(DIVar, DIExpr, APN)) - return; - - if (!valueCoversEntireFragment(APN->getType(), DII)) { - // FIXME: If only referring to a part of the variable described by the - // dbg.declare, then we want to insert a dbg.value for the corresponding - // fragment. - LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " - << *DII << '\n'); - return; - } - - BasicBlock *BB = APN->getParent(); - auto InsertionPt = BB->getFirstInsertionPt(); - - DebugLoc NewLoc = getDebugValueLoc(DII); - - // The block may be a catchswitch block, which does not have a valid - // insertion point. - // FIXME: Insert dbg.value markers in the successors when appropriate. - if (InsertionPt != BB->end()) { - insertDbgValueOrDbgVariableRecord(Builder, APN, DIVar, DIExpr, NewLoc, - InsertionPt); - } -} - void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI, DIBuilder &Builder) { auto *DIVar = DVR->getVariable(); @@ -1981,7 +1821,7 @@ bool llvm::LowerDbgDeclare(Function &F) { if (Dbgs.empty() && DVRs.empty()) return Changed; - auto LowerOne = [&](auto *DDI) { + auto LowerOne = [&](DbgVariableRecord *DDI) { AllocaInst *AI = dyn_cast_or_null(DDI->getVariableLocationOp(0)); // If this is an alloca for a scalar variable, insert a dbg.value @@ -2036,7 +1876,6 @@ bool llvm::LowerDbgDeclare(Function &F) { Changed = true; }; - for_each(Dbgs, LowerOne); for_each(DVRs, LowerOne); if (Changed) @@ -2046,12 +1885,9 @@ bool llvm::LowerDbgDeclare(Function &F) { return Changed; } -// RemoveDIs: re-implementation of insertDebugValuesForPHIs, but which pulls the -// debug-info out of the block's DbgVariableRecords rather than dbg.value -// intrinsics. -static void -insertDbgVariableRecordsForPHIs(BasicBlock *BB, - SmallVectorImpl &InsertedPHIs) { +/// Propagate dbg.value records through the newly inserted PHIs. +void llvm::insertDebugValuesForPHIs(BasicBlock *BB, + SmallVectorImpl &InsertedPHIs) { assert(BB && "No BasicBlock to clone DbgVariableRecord(s) from."); if (InsertedPHIs.size() == 0) return; @@ -2113,76 +1949,12 @@ insertDbgVariableRecordsForPHIs(BasicBlock *BB, } } -/// Propagate dbg.value intrinsics through the newly inserted PHIs. -void llvm::insertDebugValuesForPHIs(BasicBlock *BB, - SmallVectorImpl &InsertedPHIs) { - assert(BB && "No BasicBlock to clone dbg.value(s) from."); - if (InsertedPHIs.size() == 0) - return; - - insertDbgVariableRecordsForPHIs(BB, InsertedPHIs); - - // Map existing PHI nodes to their dbg.values. - ValueToValueMapTy DbgValueMap; - for (auto &I : *BB) { - if (auto DbgII = dyn_cast(&I)) { - for (Value *V : DbgII->location_ops()) - if (auto *Loc = dyn_cast_or_null(V)) - DbgValueMap.insert({Loc, DbgII}); - } - } - if (DbgValueMap.size() == 0) - return; - - // Map a pair of the destination BB and old dbg.value to the new dbg.value, - // so that if a dbg.value is being rewritten to use more than one of the - // inserted PHIs in the same destination BB, we can update the same dbg.value - // with all the new PHIs instead of creating one copy for each. - MapVector, - DbgVariableIntrinsic *> - NewDbgValueMap; - // Then iterate through the new PHIs and look to see if they use one of the - // previously mapped PHIs. If so, create a new dbg.value intrinsic that will - // propagate the info through the new PHI. 
If we use more than one new PHI in - // a single destination BB with the same old dbg.value, merge the updates so - // that we get a single new dbg.value with all the new PHIs. - for (auto *PHI : InsertedPHIs) { - BasicBlock *Parent = PHI->getParent(); - // Avoid inserting an intrinsic into an EH block. - if (Parent->getFirstNonPHIIt()->isEHPad()) - continue; - for (auto *VI : PHI->operand_values()) { - auto V = DbgValueMap.find(VI); - if (V != DbgValueMap.end()) { - auto *DbgII = cast(V->second); - auto [NewDI, Inserted] = NewDbgValueMap.try_emplace({Parent, DbgII}); - if (Inserted) - NewDI->second = cast(DbgII->clone()); - DbgVariableIntrinsic *NewDbgII = NewDI->second; - // If PHI contains VI as an operand more than once, we may - // replaced it in NewDbgII; confirm that it is present. - if (is_contained(NewDbgII->location_ops(), VI)) - NewDbgII->replaceVariableLocationOp(VI, PHI); - } - } - } - // Insert thew new dbg.values into their destination blocks. - for (auto DI : NewDbgValueMap) { - BasicBlock *Parent = DI.first.first; - auto *NewDbgII = DI.second; - auto InsertionPt = Parent->getFirstInsertionPt(); - assert(InsertionPt != Parent->end() && "Ill-formed basic block"); - NewDbgII->insertBefore(InsertionPt); - } -} - bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, DIBuilder &Builder, uint8_t DIExprFlags, int Offset) { - TinyPtrVector DbgDeclares = findDbgDeclares(Address); TinyPtrVector DVRDeclares = findDVRDeclares(Address); - auto ReplaceOne = [&](auto *DII) { + auto ReplaceOne = [&](DbgVariableRecord *DII) { assert(DII->getVariable() && "Missing variable"); auto *DIExpr = DII->getExpression(); DIExpr = DIExpression::prepend(DIExpr, DIExprFlags, Offset); @@ -2190,10 +1962,9 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, DII->replaceVariableLocationOp(Address, NewAddress); }; - for_each(DbgDeclares, ReplaceOne); for_each(DVRDeclares, ReplaceOne); - return !DbgDeclares.empty() || !DVRDeclares.empty(); + return !DVRDeclares.empty(); } static void updateOneDbgValueForAlloca(const DebugLoc &Loc, @@ -2250,6 +2021,7 @@ void llvm::salvageDebugInfo(Instruction &I) { SmallVector DbgUsers; SmallVector DPUsers; findDbgUsers(DbgUsers, &I, &DPUsers); + assert(DbgUsers.empty()); salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers); } @@ -2298,66 +2070,9 @@ void llvm::salvageDebugInfoForDbgValues( const unsigned MaxExpressionSize = 128; bool Salvaged = false; - for (auto *DII : DbgUsers) { - if (auto *DAI = dyn_cast(DII)) { - if (DAI->getAddress() == &I) { - salvageDbgAssignAddress(DAI); - Salvaged = true; - } - if (DAI->getValue() != &I) - continue; - } - - // Do not add DW_OP_stack_value for DbgDeclare, because they are implicitly - // pointing out the value as a DWARF memory location description. - bool StackValue = isa(DII); - auto DIILocation = DII->location_ops(); - assert( - is_contained(DIILocation, &I) && - "DbgVariableIntrinsic must use salvaged instruction as its location"); - SmallVector AdditionalValues; - // `I` may appear more than once in DII's location ops, and each use of `I` - // must be updated in the DIExpression and potentially have additional - // values added; thus we call salvageDebugInfoImpl for each `I` instance in - // DIILocation. 
- Value *Op0 = nullptr; - DIExpression *SalvagedExpr = DII->getExpression(); - auto LocItr = find(DIILocation, &I); - while (SalvagedExpr && LocItr != DIILocation.end()) { - SmallVector Ops; - unsigned LocNo = std::distance(DIILocation.begin(), LocItr); - uint64_t CurrentLocOps = SalvagedExpr->getNumLocationOperands(); - Op0 = salvageDebugInfoImpl(I, CurrentLocOps, Ops, AdditionalValues); - if (!Op0) - break; - SalvagedExpr = - DIExpression::appendOpsToArg(SalvagedExpr, Ops, LocNo, StackValue); - LocItr = std::find(++LocItr, DIILocation.end(), &I); - } - // salvageDebugInfoImpl should fail on examining the first element of - // DbgUsers, or none of them. - if (!Op0) - break; + // We should never see debug intrinsics nowadays. + assert(DbgUsers.empty()); - SalvagedExpr = SalvagedExpr->foldConstantMath(); - DII->replaceVariableLocationOp(&I, Op0); - bool IsValidSalvageExpr = SalvagedExpr->getNumElements() <= MaxExpressionSize; - if (AdditionalValues.empty() && IsValidSalvageExpr) { - DII->setExpression(SalvagedExpr); - } else if (isa(DII) && IsValidSalvageExpr && - DII->getNumVariableLocationOps() + AdditionalValues.size() <= - MaxDebugArgs) { - DII->addVariableLocationOps(AdditionalValues, SalvagedExpr); - } else { - // Do not salvage using DIArgList for dbg.declare, as it is not currently - // supported in those instructions. Also do not salvage if the resulting - // DIArgList would contain an unreasonably large number of values. - DII->setKillLocation(); - } - LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n'); - Salvaged = true; - } - // Duplicate of above block for DbgVariableRecords. for (auto *DVR : DPUsers) { if (DVR->isDbgAssign()) { if (DVR->getAddress() == &I) { @@ -2426,9 +2141,6 @@ void llvm::salvageDebugInfoForDbgValues( if (Salvaged) return; - for (auto *DII : DbgUsers) - DII->setKillLocation(); - for (auto *DVR : DPUsers) DVR->setKillLocation(); } @@ -2645,7 +2357,6 @@ using DbgValReplacement = std::optional; /// changes are made. static bool rewriteDebugUsers( Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT, - function_ref RewriteExpr, function_ref RewriteDVRExpr) { // Find debug users of From. SmallVector Users; @@ -2654,43 +2365,32 @@ static bool rewriteDebugUsers( if (Users.empty() && DPUsers.empty()) return false; + // Ignore intrinsic-users: they are no longer supported and should never + // appear. + assert(Users.empty()); + // Prevent use-before-def of To. bool Changed = false; - SmallPtrSet UndefOrSalvage; SmallPtrSet UndefOrSalvageDVR; if (isa(&To)) { - bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint; - - for (auto *DII : Users) { - // It's common to see a debug user between From and DomPoint. Move it - // after DomPoint to preserve the variable update without any reordering. - if (DomPointAfterFrom && DII->getNextNonDebugInstruction() == &DomPoint) { - LLVM_DEBUG(dbgs() << "MOVE: " << *DII << '\n'); - DII->moveAfter(&DomPoint); - Changed = true; - - // Users which otherwise aren't dominated by the replacement value must - // be salvaged or deleted. - } else if (!DT.dominates(&DomPoint, DII)) { - UndefOrSalvage.insert(DII); - } - } + bool DomPointAfterFrom = From.getNextNode() == &DomPoint; // DbgVariableRecord implementation of the above. for (auto *DVR : DPUsers) { Instruction *MarkedInstr = DVR->getMarker()->MarkedInstr; Instruction *NextNonDebug = MarkedInstr; - // The next instruction might still be a dbg.declare, skip over it. 
- if (isa(NextNonDebug)) - NextNonDebug = NextNonDebug->getNextNonDebugInstruction(); + // It's common to see a debug user between From and DomPoint. Move it + // after DomPoint to preserve the variable update without any reordering. if (DomPointAfterFrom && NextNonDebug == &DomPoint) { LLVM_DEBUG(dbgs() << "MOVE: " << *DVR << '\n'); DVR->removeFromParent(); - // Ensure there's a marker. DomPoint.getParent()->insertDbgRecordAfter(DVR, &DomPoint); Changed = true; + + // Users which otherwise aren't dominated by the replacement value must + // be salvaged or deleted. } else if (!DT.dominates(&DomPoint, MarkedInstr)) { UndefOrSalvageDVR.insert(DVR); } @@ -2698,19 +2398,6 @@ static bool rewriteDebugUsers( } // Update debug users without use-before-def risk. - for (auto *DII : Users) { - if (UndefOrSalvage.count(DII)) - continue; - - DbgValReplacement DVRepl = RewriteExpr(*DII); - if (!DVRepl) - continue; - - DII->replaceVariableLocationOp(&From, &To); - DII->setExpression(*DVRepl); - LLVM_DEBUG(dbgs() << "REWRITE: " << *DII << '\n'); - Changed = true; - } for (auto *DVR : DPUsers) { if (UndefOrSalvageDVR.count(DVR)) continue; @@ -2725,7 +2412,7 @@ static bool rewriteDebugUsers( Changed = true; } - if (!UndefOrSalvage.empty() || !UndefOrSalvageDVR.empty()) { + if (!UndefOrSalvageDVR.empty()) { // Try to salvage the remaining debug users. salvageDebugInfo(From); Changed = true; @@ -2770,9 +2457,6 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, Type *FromTy = From.getType(); Type *ToTy = To.getType(); - auto Identity = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement { - return DII.getExpression(); - }; auto IdentityDVR = [&](DbgVariableRecord &DVR) -> DbgValReplacement { return DVR.getExpression(); }; @@ -2781,7 +2465,7 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, Module &M = *From.getModule(); const DataLayout &DL = M.getDataLayout(); if (isBitCastSemanticsPreserving(DL, FromTy, ToTy)) - return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDVR); + return rewriteDebugUsers(From, To, DomPoint, DT, IdentityDVR); // Handle integer-to-integer widening and narrowing. // FIXME: Use DW_OP_convert when it's available everywhere. @@ -2793,24 +2477,10 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, // When the width of the result grows, assume that a debugger will only // access the low `FromBits` bits when inspecting the source variable. if (FromBits < ToBits) - return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDVR); + return rewriteDebugUsers(From, To, DomPoint, DT, IdentityDVR); // The width of the result has shrunk. Use sign/zero extension to describe // the source variable's high bits. - auto SignOrZeroExt = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement { - DILocalVariable *Var = DII.getVariable(); - - // Without knowing signedness, sign/zero extension isn't possible. - auto Signedness = Var->getSignedness(); - if (!Signedness) - return std::nullopt; - - bool Signed = *Signedness == DIBasicType::Signedness::Signed; - return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits, - Signed); - }; - // RemoveDIs: duplicate implementation working on DbgVariableRecords rather - // than on dbg.value intrinsics. 
auto SignOrZeroExtDVR = [&](DbgVariableRecord &DVR) -> DbgValReplacement { DILocalVariable *Var = DVR.getVariable(); @@ -2823,8 +2493,7 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, return DIExpression::appendExt(DVR.getExpression(), ToBits, FromBits, Signed); }; - return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt, - SignOrZeroExtDVR); + return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExtDVR); } // TODO: Floating-point conversions, vectors. @@ -3070,9 +2739,9 @@ static bool markAliveBlocks(Function &F, // If we found a call to a no-return function, insert an unreachable // instruction after it. Make sure there isn't *already* one there // though. - if (!isa(CI->getNextNonDebugInstruction())) { + if (!isa(CI->getNextNode())) { // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(CI->getNextNonDebugInstruction(), false, DTU); + changeToUnreachable(CI->getNextNode(), false, DTU); Changed = true; } break; @@ -3700,8 +3369,7 @@ void llvm::dropDebugUsers(Instruction &I) { SmallVector DbgUsers; SmallVector DPUsers; findDbgUsers(DbgUsers, &I, &DPUsers); - for (auto *DII : DbgUsers) - DII->eraseFromParent(); + assert(DbgUsers.empty()); for (auto *DVR : DPUsers) DVR->eraseFromParent(); } @@ -3800,10 +3468,6 @@ void llvm::remapDebugVariable(ValueToValueMapTy &Mapping, Instruction *Inst) { if (I != Mapping.end()) DA->setAddress(I->second); }; - if (auto DVI = dyn_cast(Inst)) - RemapDebugOperands(DVI, DVI->location_ops()); - if (auto DAI = dyn_cast(Inst)) - RemapAssignAddress(DAI); for (DbgVariableRecord &DVR : filterDbgVars(Inst->getDbgRecordRange())) { RemapDebugOperands(&DVR, DVR.location_ops()); if (DVR.isDbgAssign()) diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 511c15555fa83..6226596017980 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -168,22 +168,6 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE, Info.AllocasToInstrument[AI].LifetimeEnd.push_back(II); return; } - if (auto *DVI = dyn_cast(&Inst)) { - auto AddIfInteresting = [&](Value *V) { - if (auto *AI = dyn_cast_or_null(V)) { - if (getAllocaInterestingness(*AI) != - AllocaInterestingness::kInteresting) - return; - AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; - auto &DVIVec = AInfo.DbgVariableIntrinsics; - if (DVIVec.empty() || DVIVec.back() != DVI) - DVIVec.push_back(DVI); - } - }; - for_each(DVI->location_ops(), AddIfInteresting); - if (auto *DAI = dyn_cast(DVI)) - AddIfInteresting(DAI->getAddress()); - } Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) @@ -297,19 +281,12 @@ Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot) { IRB.CreateCall(ThreadPointerFunc), 8 * Slot); } -static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) { - return dyn_cast(DVI); -} - static DbgVariableRecord *DynCastToDbgAssign(DbgVariableRecord *DVR) { return DVR->isDbgAssign() ? DVR : nullptr; } void annotateDebugRecords(AllocaInfo &Info, unsigned int Tag) { - // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records, - // abstracted over whether they're intrinsic-stored or DbgVariableRecord - // stored. - auto AnnotateDbgRecord = [&](auto *DPtr) { + auto AnnotateDbgRecord = [&](DbgVariableRecord *DPtr) { // Prepend "tag_offset, N" to the dwarf expression. 
// Tag offset logically applies to the alloca pointer, and it makes sense // to put it at the beginning of the expression. @@ -325,7 +302,6 @@ void annotateDebugRecords(AllocaInfo &Info, unsigned int Tag) { } }; - llvm::for_each(Info.DbgVariableIntrinsics, AnnotateDbgRecord); llvm::for_each(Info.DbgVariableRecords, AnnotateDbgRecord); } diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 46808a818cb26..73b5f48796b7a 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -115,29 +115,17 @@ static void createDebugValue(DIBuilder &DIB, Value *NewValue, DbgVariableRecord::createDbgVariableRecord(NewValue, Variable, Expression, DI, *InsertBefore); } -static void createDebugValue(DIBuilder &DIB, Value *NewValue, - DILocalVariable *Variable, - DIExpression *Expression, const DILocation *DI, - Instruction *InsertBefore) { - DIB.insertDbgValueIntrinsic(NewValue, Variable, Expression, DI, - InsertBefore->getIterator()); -} /// Helper for updating assignment tracking debug info when promoting allocas. class AssignmentTrackingInfo { /// DbgAssignIntrinsics linked to the alloca with at most one per variable /// fragment. (i.e. not be a comprehensive set if there are multiple /// dbg.assigns for one variable fragment). - SmallVector DbgAssigns; SmallVector DVRAssigns; public: void init(AllocaInst *AI) { SmallSet Vars; - for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(AI)) { - if (Vars.insert(DebugVariable(DAI)).second) - DbgAssigns.push_back(DAI); - } for (DbgVariableRecord *DVR : at::getDVRAssignmentMarkers(AI)) { if (Vars.insert(DebugVariable(DVR)).second) DVRAssigns.push_back(DVR); @@ -148,11 +136,10 @@ class AssignmentTrackingInfo { /// \p ToDelete that stores to this alloca. void updateForDeletedStore( StoreInst *ToDelete, DIBuilder &DIB, - SmallSet *DbgAssignsToDelete, SmallSet *DVRAssignsToDelete) const { // There's nothing to do if the alloca doesn't have any variables using // assignment tracking. - if (DbgAssigns.empty() && DVRAssigns.empty()) + if (DVRAssigns.empty()) return; // Insert a dbg.value where the linked dbg.assign is and remember to delete @@ -169,25 +156,22 @@ class AssignmentTrackingInfo { DbgAssign->getExpression(), DbgAssign->getDebugLoc(), DbgAssign); }; - for (auto *Assign : at::getAssignmentMarkers(ToDelete)) - InsertValueForAssign(Assign, DbgAssignsToDelete); for (auto *Assign : at::getDVRAssignmentMarkers(ToDelete)) InsertValueForAssign(Assign, DVRAssignsToDelete); // It's possible for variables using assignment tracking to have no - // dbg.assign linked to this store. These are variables in DbgAssigns that + // dbg.assign linked to this store. These are variables in DVRAssigns that // are missing from VarHasDbgAssignForStore. Since there isn't a dbg.assign // to mark the assignment - and the store is going to be deleted - insert a // dbg.value to do that now. An untracked store may be either one that // cannot be represented using assignment tracking (non-const offset or // size) or one that is trackable but has had its DIAssignID attachment // dropped accidentally. 
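Distilled from updateForDeletedStore above, a hedged sketch of the core demotion step. The helper name is mine, and the real code additionally distinguishes assignments linked to the dying store from unlinked ones (the VarHasDbgAssignForStore check in this hunk); this sketch folds both paths into one call.

#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugProgramInstruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Before a store is deleted, rewrite each dbg.assign record attached to it as
// a plain dbg.value so the variable's last known value is not lost.
void demoteLinkedAssigns(StoreInst *ToDelete, DIBuilder &DIB) {
  for (DbgVariableRecord *Assign : at::getDVRAssignmentMarkers(ToDelete))
    ConvertDebugDeclareToDebugValue(Assign, ToDelete, DIB);
}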
- auto ConvertUnlinkedAssignToValue = [&](auto *Assign) { + auto ConvertUnlinkedAssignToValue = [&](DbgVariableRecord *Assign) { if (VarHasDbgAssignForStore.contains(DebugVariableAggregate(Assign))) return; ConvertDebugDeclareToDebugValue(Assign, ToDelete, DIB); }; - for_each(DbgAssigns, ConvertUnlinkedAssignToValue); for_each(DVRAssigns, ConvertUnlinkedAssignToValue); } @@ -197,21 +181,15 @@ class AssignmentTrackingInfo { // Regardless of the position of dbg.assigns relative to stores, the // incoming values into a new PHI should be the same for the (imaginary) // debug-phi. - for (auto *DAI : DbgAssigns) - ConvertDebugDeclareToDebugValue(DAI, NewPhi, DIB); for (auto *DVR : DVRAssigns) ConvertDebugDeclareToDebugValue(DVR, NewPhi, DIB); } - void clear() { - DbgAssigns.clear(); - DVRAssigns.clear(); - } - bool empty() { return DbgAssigns.empty() && DVRAssigns.empty(); } + void clear() { DVRAssigns.clear(); } + bool empty() { return DVRAssigns.empty(); } }; struct AllocaInfo { - using DbgUserVec = SmallVector; using DPUserVec = SmallVector; SmallVector DefiningBlocks; @@ -222,7 +200,6 @@ struct AllocaInfo { bool OnlyUsedInOneBlock; /// Debug users of the alloca - does not include dbg.assign intrinsics. - DbgUserVec DbgUsers; DPUserVec DPUsers; /// Helper to update assignment tracking debug info. AssignmentTrackingInfo AssignmentTracking; @@ -233,7 +210,6 @@ struct AllocaInfo { OnlyStore = nullptr; OnlyBlock = nullptr; OnlyUsedInOneBlock = true; - DbgUsers.clear(); DPUsers.clear(); AssignmentTracking.clear(); } @@ -267,13 +243,10 @@ struct AllocaInfo { OnlyUsedInOneBlock = false; } } - DbgUserVec AllDbgUsers; + SmallVector AllDbgUsers; SmallVector AllDPUsers; findDbgUsers(AllDbgUsers, AI, &AllDPUsers); - std::copy_if(AllDbgUsers.begin(), AllDbgUsers.end(), - std::back_inserter(DbgUsers), [](DbgVariableIntrinsic *DII) { - return !isa(DII); - }); + assert(AllDbgUsers.empty()); std::copy_if(AllDPUsers.begin(), AllDPUsers.end(), std::back_inserter(DPUsers), [](DbgVariableRecord *DVR) { return !DVR->isDbgAssign(); }); @@ -401,10 +374,9 @@ struct PromoteMem2Reg { /// to. DenseMap PhiToAllocaMap; - /// For each alloca, we keep track of the dbg.declare intrinsic that + /// For each alloca, we keep track of the dbg.declare record that /// describes it, if any, so that we can convert it to a dbg.value - /// intrinsic if the alloca gets promoted. - SmallVector AllocaDbgUsers; + /// record if the alloca gets promoted. SmallVector AllocaDPUsers; /// For each alloca, keep an instance of a helper class that gives us an easy @@ -412,7 +384,6 @@ struct PromoteMem2Reg { SmallVector AllocaATInfo; /// A set of dbg.assigns to delete because they've been demoted to /// dbg.values. Call cleanUpDbgAssigns to delete them. - SmallSet DbgAssignsToDelete; SmallSet DVRAssignsToDelete; /// The set of basic blocks the renamer has already visited. @@ -467,9 +438,6 @@ struct PromoteMem2Reg { /// Delete dbg.assigns that have been demoted to dbg.values. 
void cleanUpDbgAssigns() { - for (auto *DAI : DbgAssignsToDelete) - DAI->eraseFromParent(); - DbgAssignsToDelete.clear(); for (auto *DVR : DVRAssignsToDelete) DVR->eraseFromParent(); DVRAssignsToDelete.clear(); @@ -571,7 +539,6 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC, - SmallSet *DbgAssignsToDelete, SmallSet *DVRAssignsToDelete) { StoreInst *OnlyStore = Info.OnlyStore; Value *ReplVal = OnlyStore->getOperand(0); @@ -637,27 +604,23 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); // Update assignment tracking info for the store we're going to delete. - Info.AssignmentTracking.updateForDeletedStore( - Info.OnlyStore, DIB, DbgAssignsToDelete, DVRAssignsToDelete); + Info.AssignmentTracking.updateForDeletedStore(Info.OnlyStore, DIB, + DVRAssignsToDelete); // Record debuginfo for the store and remove the declaration's // debuginfo. - auto ConvertDebugInfoForStore = [&](auto &Container) { - for (auto *DbgItem : Container) { - if (DbgItem->isAddressOfVariable()) { - ConvertDebugDeclareToDebugValue(DbgItem, Info.OnlyStore, DIB); - DbgItem->eraseFromParent(); - } else if (DbgItem->isValueOfVariable() && - DbgItem->getExpression()->startsWithDeref()) { - InsertDebugValueAtStoreLoc(DbgItem, Info.OnlyStore, DIB); - DbgItem->eraseFromParent(); - } else if (DbgItem->getExpression()->startsWithDeref()) { - DbgItem->eraseFromParent(); - } + for (DbgVariableRecord *DbgItem : Info.DPUsers) { + if (DbgItem->isAddressOfVariable()) { + ConvertDebugDeclareToDebugValue(DbgItem, Info.OnlyStore, DIB); + DbgItem->eraseFromParent(); + } else if (DbgItem->isValueOfVariable() && + DbgItem->getExpression()->startsWithDeref()) { + InsertDebugValueAtStoreLoc(DbgItem, Info.OnlyStore, DIB); + DbgItem->eraseFromParent(); + } else if (DbgItem->getExpression()->startsWithDeref()) { + DbgItem->eraseFromParent(); } - }; - ConvertDebugInfoForStore(Info.DbgUsers); - ConvertDebugInfoForStore(Info.DPUsers); + } // Remove dbg.assigns linked to the alloca as these are now redundant. at::deleteAssignmentMarkers(AI); @@ -690,7 +653,6 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC, - SmallSet *DbgAssignsToDelete, SmallSet *DVRAssignsToDelete) { // The trickiest case to handle is when we have large blocks. Because of this, // this code is optimized assuming that large blocks happen. This does not @@ -755,18 +717,13 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, while (!AI->use_empty()) { StoreInst *SI = cast(AI->user_back()); // Update assignment tracking info for the store we're going to delete. - Info.AssignmentTracking.updateForDeletedStore(SI, DIB, DbgAssignsToDelete, - DVRAssignsToDelete); + Info.AssignmentTracking.updateForDeletedStore(SI, DIB, DVRAssignsToDelete); // Record debuginfo for the store before removing it. 
- auto DbgUpdateForStore = [&](auto &Container) { - for (auto *DbgItem : Container) { - if (DbgItem->isAddressOfVariable()) { - ConvertDebugDeclareToDebugValue(DbgItem, SI, DIB); - } + for (DbgVariableRecord *DbgItem : Info.DPUsers) { + if (DbgItem->isAddressOfVariable()) { + ConvertDebugDeclareToDebugValue(DbgItem, SI, DIB); } - }; - DbgUpdateForStore(Info.DbgUsers); - DbgUpdateForStore(Info.DPUsers); + } SI->eraseFromParent(); LBI.deleteValue(SI); @@ -777,14 +734,11 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, AI->eraseFromParent(); // The alloca's debuginfo can be removed as well. - auto DbgUpdateForAlloca = [&](auto &Container) { - for (auto *DbgItem : Container) - if (DbgItem->isAddressOfVariable() || - DbgItem->getExpression()->startsWithDeref()) - DbgItem->eraseFromParent(); - }; - DbgUpdateForAlloca(Info.DbgUsers); - DbgUpdateForAlloca(Info.DPUsers); + for (DbgVariableRecord *DbgItem : Info.DPUsers) { + if (DbgItem->isAddressOfVariable() || + DbgItem->getExpression()->startsWithDeref()) + DbgItem->eraseFromParent(); + } ++NumLocalPromoted; return true; @@ -793,7 +747,6 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, void PromoteMem2Reg::run() { Function &F = *DT.getRoot()->getParent(); - AllocaDbgUsers.resize(Allocas.size()); AllocaATInfo.resize(Allocas.size()); AllocaDPUsers.resize(Allocas.size()); @@ -830,7 +783,7 @@ void PromoteMem2Reg::run() { // it that are directly dominated by the definition with the value stored. if (Info.DefiningBlocks.size() == 1) { if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC, - &DbgAssignsToDelete, &DVRAssignsToDelete)) { + &DVRAssignsToDelete)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); ++NumSingleStore; @@ -842,7 +795,7 @@ void PromoteMem2Reg::run() { // linear sweep over the block to eliminate it. if (Info.OnlyUsedInOneBlock && promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC, - &DbgAssignsToDelete, &DVRAssignsToDelete)) { + &DVRAssignsToDelete)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); continue; @@ -852,9 +805,7 @@ void PromoteMem2Reg::run() { if (BBNumPreds.empty()) BBNumPreds.resize(F.getMaxBlockNumber()); - // Remember the dbg.declare intrinsic describing this alloca, if any. - if (!Info.DbgUsers.empty()) - AllocaDbgUsers[AllocaNum] = Info.DbgUsers; + // Remember the dbg.declare record describing this alloca, if any. if (!Info.AssignmentTracking.empty()) AllocaATInfo[AllocaNum] = Info.AssignmentTracking; if (!Info.DPUsers.empty()) @@ -930,16 +881,12 @@ void PromoteMem2Reg::run() { } // Remove alloca's dbg.declare intrinsics from the function. - auto RemoveDbgDeclares = [&](auto &Container) { - for (auto &DbgUsers : Container) { - for (auto *DbgItem : DbgUsers) - if (DbgItem->isAddressOfVariable() || - DbgItem->getExpression()->startsWithDeref()) - DbgItem->eraseFromParent(); - } - }; - RemoveDbgDeclares(AllocaDbgUsers); - RemoveDbgDeclares(AllocaDPUsers); + for (auto &DbgUsers : AllocaDPUsers) { + for (DbgVariableRecord *DbgItem : DbgUsers) + if (DbgItem->isAddressOfVariable() || + DbgItem->getExpression()->startsWithDeref()) + DbgItem->eraseFromParent(); + } // Loop over all of the PHI nodes and see if there are any that we can get // rid of because they merge all of the same incoming values. This can @@ -1182,13 +1129,9 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred) { // The currently active variable for this block is now the PHI. 
IncomingVals.set(AllocaNo, APN); AllocaATInfo[AllocaNo].updateForNewPhi(APN, DIB); - auto ConvertDbgDeclares = [&](auto &Container) { - for (auto *DbgItem : Container) - if (DbgItem->isAddressOfVariable()) - ConvertDebugDeclareToDebugValue(DbgItem, APN, DIB); - }; - ConvertDbgDeclares(AllocaDbgUsers[AllocaNo]); - ConvertDbgDeclares(AllocaDPUsers[AllocaNo]); + for (DbgVariableRecord *DbgItem : AllocaDPUsers[AllocaNo]) + if (DbgItem->isAddressOfVariable()) + ConvertDebugDeclareToDebugValue(DbgItem, APN, DIB); // Get the next phi node. ++PNI; @@ -1242,15 +1185,11 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred) { // Record debuginfo for the store before removing it. IncomingLocs.set(AllocaNo, SI->getDebugLoc()); - AllocaATInfo[AllocaNo].updateForDeletedStore(SI, DIB, &DbgAssignsToDelete, + AllocaATInfo[AllocaNo].updateForDeletedStore(SI, DIB, &DVRAssignsToDelete); - auto ConvertDbgDeclares = [&](auto &Container) { - for (auto *DbgItem : Container) - if (DbgItem->isAddressOfVariable()) - ConvertDebugDeclareToDebugValue(DbgItem, SI, DIB); - }; - ConvertDbgDeclares(AllocaDbgUsers[ai->second]); - ConvertDbgDeclares(AllocaDPUsers[ai->second]); + for (DbgVariableRecord *DbgItem : AllocaDPUsers[ai->second]) + if (DbgItem->isAddressOfVariable()) + ConvertDebugDeclareToDebugValue(DbgItem, SI, DIB); SI->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 24fe08d6c3e4e..ed08c0bfa2e7d 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1223,6 +1223,24 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { return Result; } +Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { + const Loop *L = S->getLoop(); + BasicBlock *EB = L->getExitBlock(); + if (!EB || !EB->getSinglePredecessor() || + !SE.DT.dominates(EB, Builder.GetInsertBlock())) + return nullptr; + + for (auto &PN : EB->phis()) { + if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType()) + continue; + auto *ExitV = SE.getSCEV(&PN); + if (S == ExitV) + return &PN; + } + + return nullptr; +} + Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { // In canonical mode we compute the addrec as an expression of a canonical IV // using evaluateAtIteration and expand the resulting SCEV expression. This @@ -1262,6 +1280,11 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { return V; } + // If S is expanded outside the defining loop, check if there is a + // matching LCSSA phi node for it. 
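A source-level illustration of the shape the new tryToReuseLCSSAPhi above targets; the example is hypothetical, not from the patch. After the loop, the exit block's LCSSA phi already carries the final value of the {0,+,S} addrec, so an expansion of that addrec at a point dominated by the exit can return the phi instead of re-materializing S*N.

// 'Off' is the addrec {0,+,S} inside the loop. Its LCSSA phi in the single
// exit block already holds the exit value, so a SCEV expansion of that value
// after the loop can reuse the phi rather than emitting a fresh S*N.
unsigned lastOffset(unsigned N, unsigned S) {
  unsigned Off = 0;
  for (unsigned I = 0; I < N; ++I)
    Off += S;
  return Off;
}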
+ if (Value *V = tryToReuseLCSSAPhi(S)) + return V; + // {X,+,F} --> X + {0,+,F} if (!S->getStart()->isZero()) { if (isa(S->getType())) { @@ -1682,7 +1705,7 @@ void SCEVExpander::replaceCongruentIVInc( if (PHINode *PN = dyn_cast(OrigInc)) IP = PN->getParent()->getFirstInsertionPt(); else - IP = OrigInc->getNextNonDebugInstruction()->getIterator(); + IP = OrigInc->getNextNode()->getIterator(); IRBuilder<> Builder(IP->getParent(), IP); Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc()); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index a75f29000ca18..75c96503d556d 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7493,7 +7493,7 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, SmallPtrSet Phis; SmallPtrSet Seen; DenseMap> PhiPredIVs; - DenseMap> BBToSuccessorIndexes; + DenseMap> BBToSuccessorIndexes; SmallVector Cases; Cases.reserve(SI->getNumSuccessors()); @@ -7505,12 +7505,6 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, if (BB->size() != 1) continue; - // FIXME: This case needs some extra care because the terminators other than - // SI need to be updated. For now, consider only backedges to the SI. - if (BB->hasNPredecessorsOrMore(4) || - BB->getUniquePredecessor() != SI->getParent()) - continue; - // FIXME: Relax that the terminator is a BranchInst by checking for equality // on other kinds of terminators. We decide to only support unconditional // branches for now for compile time reasons. @@ -7518,14 +7512,24 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, if (!BI || BI->isConditional()) continue; - if (Seen.insert(BB).second) { - // Keep track of which PHIs we need as keys in PhiPredIVs below. - for (BasicBlock *Succ : BI->successors()) - Phis.insert_range(llvm::make_pointer_range(Succ->phis())); - // Add the successor only if not previously visited. - Cases.emplace_back(SwitchSuccWrapper{BB, &PhiPredIVs}); + if (!Seen.insert(BB).second) { + auto It = BBToSuccessorIndexes.find(BB); + if (It != BBToSuccessorIndexes.end()) + It->second.emplace_back(I); + continue; } + // FIXME: This case needs some extra care because the terminators other than + // SI need to be updated. For now, consider only backedges to the SI. + if (BB->getUniquePredecessor() != SI->getParent()) + continue; + + // Keep track of which PHIs we need as keys in PhiPredIVs below. + for (BasicBlock *Succ : BI->successors()) + Phis.insert_range(llvm::make_pointer_range(Succ->phis())); + + // Add the successor only if not previously visited. + Cases.emplace_back(SwitchSuccWrapper{BB, &PhiPredIVs}); BBToSuccessorIndexes[BB].emplace_back(I); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5380a0fc6498a..ceeabd65cced3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8112,7 +8112,7 @@ bool VPRecipeBuilder::getScaledReductions( std::optional BinOpc; Type *ExtOpTypes[2] = {nullptr}; - auto CollectExtInfo = [&Exts, + auto CollectExtInfo = [this, &Exts, &ExtOpTypes](SmallVectorImpl &Ops) -> bool { unsigned I = 0; for (Value *OpI : Ops) { @@ -8120,6 +8120,11 @@ bool VPRecipeBuilder::getScaledReductions( if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) return false; Exts[I] = cast(OpI); + + // TODO: We should be able to support live-ins. 
+ if (!CM.TheLoop->contains(Exts[I])) + return false; + ExtOpTypes[I] = ExtOp->getType(); I++; } @@ -8437,56 +8442,11 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, } } -// Collect VPIRInstructions for phis in the exit block from the latch only. -static SetVector collectUsersInLatchExitBlock(VPlan &Plan) { - SetVector ExitUsersToFix; - for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { - - if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) - continue; - - for (VPRecipeBase &R : ExitVPBB->phis()) { - auto *ExitIRI = cast(&R); - assert(ExitIRI->getNumOperands() == 1 && "must have a single operand"); - VPValue *V = ExitIRI->getOperand(0); - if (V->isLiveIn()) - continue; - assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() && - "Only recipes defined inside a region should need fixing."); - ExitUsersToFix.insert(ExitIRI); - } - } - return ExitUsersToFix; -} - -// Add exit values to \p Plan. Extracts are added for each entry in \p -// ExitUsersToFix if needed and their operands are updated. -static void -addUsersInExitBlocks(VPlan &Plan, - const SetVector &ExitUsersToFix) { - if (ExitUsersToFix.empty()) - return; - - auto *MiddleVPBB = Plan.getMiddleBlock(); - VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); - - // Introduce extract for exiting values and update the VPIRInstructions - // modeling the corresponding LCSSA phis. - for (VPIRInstruction *ExitIRI : ExitUsersToFix) { - assert(ExitIRI->getNumOperands() == 1 && - ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && - "exit values from early exits must be fixed when branch to " - "early-exit is added"); - ExitIRI->extractLastLaneOfFirstOperand(B); - } -} - /// Handle users in the exit block for first order reductions in the original /// exit block. The penultimate value of recurrences is fed to their LCSSA phi /// users in the original exit block using the VPIRInstruction wrapping to the /// LCSSA phi. -static void addExitUsersForFirstOrderRecurrences( - VPlan &Plan, SetVector &ExitUsersToFix, VFRange &Range) { +static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); auto *ScalarPHVPBB = Plan.getScalarPreheader(); auto *MiddleVPBB = Plan.getMiddleBlock(); @@ -8575,14 +8535,15 @@ static void addExitUsersForFirstOrderRecurrences( // Now update VPIRInstructions modeling LCSSA phis in the exit block. // Extract the penultimate value of the recurrence and use it as operand for // the VPIRInstruction modeling the phi. - for (VPIRInstruction *ExitIRI : ExitUsersToFix) { - if (ExitIRI->getOperand(0) != FOR) + for (VPUser *U : FOR->users()) { + using namespace llvm::VPlanPatternMatch; + if (!match(U, m_VPInstruction( + m_Specific(FOR)))) continue; // For VF vscale x 1, if vscale = 1, we are unable to extract the - // penultimate value of the recurrence. Instead, we rely on function - // addUsersInExitBlocks to extract the last element from the result of - // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the - // recurrence phi in ExitUsersToFix. + // penultimate value of the recurrence. Instead we rely on the existing + // extract of the last element from the result of + // VPInstruction::FirstOrderRecurrenceSplice. // TODO: Consider vscale_range info and UF. 
if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne, Range)) @@ -8590,8 +8551,7 @@ static void addExitUsersForFirstOrderRecurrences( VPValue *PenultimateElement = MiddleBuilder.createNaryOp( VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()}, {}, "vector.recur.extract.for.phi"); - ExitIRI->setOperand(0, PenultimateElement); - ExitUsersToFix.remove(ExitIRI); + cast(U)->replaceAllUsesWith(PenultimateElement); } } } @@ -8625,6 +8585,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), Legal->hasUncountableEarlyExit(), Range); VPlanTransforms::createLoopRegions(*Plan); + VPlanTransforms::createExtractsForLiveOuts(*Plan); // Don't use getDecisionAndClampRange here, because we don't know the UF // so this function is better to be conservative, rather than to split @@ -8797,12 +8758,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( R->setOperand(1, WideIV->getStepValue()); } + addExitUsersForFirstOrderRecurrences(*Plan, Range); DenseMap IVEndValues; addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); - SetVector ExitUsersToFix = - collectUsersInLatchExitBlock(*Plan); - addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix, Range); - addUsersInExitBlocks(*Plan, ExitUsersToFix); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c61e1135524b6..6ad5c60105a28 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5540,8 +5540,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, return std::max(Entries[I].front()->getVectorFactor(), Entries[I].back()->getVectorFactor()); }); - unsigned NumUndefs = - count_if(CurrentOrder, [&](unsigned Idx) { return Idx == NumScalars; }); + unsigned NumUndefs = count(CurrentOrder, NumScalars); if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) return std::nullopt; return std::move(CurrentOrder); @@ -8623,11 +8622,10 @@ void BoUpSLP::tryToVectorizeGatheredLoads( State == LoadsState::CompressVectorize) return false; ConsecutiveNodesSize += VL.size(); - unsigned Start = std::distance(Slice.begin(), It); - unsigned Sz = Slice.size() - Start; + size_t Start = std::distance(Slice.begin(), It); + size_t Sz = Slice.size() - Start; return Sz < VL.size() || - Slice.slice(std::distance(Slice.begin(), It), - VL.size()) != VL; + Slice.slice(Start, VL.size()) != VL; })) continue; // Try to build long masked gather loads. @@ -11695,6 +11693,7 @@ void BoUpSLP::transformNodes() { if (StartIdx + VF > End) continue; SmallVector> Slices; + bool AllStrided = true; for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { ArrayRef Slice = VL.slice(Cnt, VF); // If any instruction is vectorized already - do not try again. @@ -11745,6 +11744,9 @@ void BoUpSLP::transformNodes() { SmallVector PointerOps; LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order, PointerOps); + AllStrided &= Res == LoadsState::StridedVectorize || + Res == LoadsState::ScatterVectorize || + Res == LoadsState::Gather; // Do not vectorize gathers. 
if (Res == LoadsState::ScatterVectorize || Res == LoadsState::Gather) { @@ -11774,6 +11776,11 @@ void BoUpSLP::transformNodes() { } Slices.emplace_back(Cnt, Slice.size()); } + // Do not try to vectorize if all slices are strided or gathered with + // vector factor 2 and there are more than 2 slices. Better to handle them + // in the gathered loads analysis, which may result in better vectorization. + if (VF == 2 && AllStrided && Slices.size() > 2) + continue; auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) { E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt); if (StartIdx == Cnt) @@ -14560,8 +14567,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); - SmallPtrSet CheckedExtracts; for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I]; @@ -14624,6 +14629,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, } SmallDenseSet, 8> CheckedScalarUser; for (ExternalUser &EU : ExternalUses) { + LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry " + << EU.E.Idx << " in lane " << EU.Lane << "\n"); + LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n"); + LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n"); + // Uses by ephemeral values are free (because the ephemeral value will be // removed prior to code generation, and so the extraction will be // removed as well). @@ -14731,6 +14741,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, // for the extract and the added cost of the sign extend if needed. InstructionCost ExtraCost = TTI::TCC_Free; auto *ScalarTy = EU.Scalar->getType(); + const unsigned BundleWidth = EU.E.getVectorFactor(); + assert(EU.Lane < BundleWidth && "Extracted lane out of bounds."); auto *VecTy = getWidenedType(ScalarTy, BundleWidth); const TreeEntry *Entry = &EU.E; auto It = MinBWs.find(Entry); @@ -14744,10 +14756,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, VecTy = getWidenedType(MinTy, BundleWidth); ExtraCost = getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane); + LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: " + << ExtraCost << "\n"); } else { ExtraCost = getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy, CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx); + LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from " + << *VecTy << ": " << ExtraCost << "\n"); } // Leave the scalar instructions as is if they are cheaper than extracts. if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr || @@ -16214,7 +16230,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // debug location to Front.
Builder.SetInsertPoint( LastInst->getParent(), - LastInst->getNextNonDebugInstruction()->getIterator()); + LastInst->getNextNode()->getIterator()); } Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } @@ -18966,7 +18982,7 @@ Value *BoUpSLP::vectorizeTree( Builder.SetInsertPoint( IVec->getParent()->getFirstNonPHIOrDbgOrLifetime()); } else if (auto *IVec = dyn_cast(Vec)) { - Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); + Builder.SetInsertPoint(IVec->getNextNode()); } Vec = Builder.CreateIntCast( Vec, @@ -19944,7 +19960,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { Instruction *PickedInst = BundleMember->getInst(); if (!Scheduled.insert(PickedInst).second) continue; - if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst) + if (PickedInst->getNextNode() != LastScheduledInst) PickedInst->moveAfter(LastScheduledInst->getPrevNode()); LastScheduledInst = PickedInst; } @@ -19953,7 +19969,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } else { auto *SD = cast(Picked); Instruction *PickedInst = SD->getInst(); - if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst) + if (PickedInst->getNextNode() != LastScheduledInst) PickedInst->moveAfter(LastScheduledInst->getPrevNode()); LastScheduledInst = PickedInst; } @@ -22131,7 +22147,7 @@ class HorizontalReduction { // Try to regroup reduced values so that it gets more profitable to try to // reduce them. Values are grouped by their value ids, instructions - by // instruction op id and/or alternate op id, plus do extra analysis for - // loads (grouping them by the distabce between pointers) and cmp + // loads (grouping them by the distance between pointers) and cmp // instructions (grouping them by the predicate). SmallMapVector< size_t, SmallMapVector, 2>, @@ -22198,10 +22214,9 @@ class HorizontalReduction { for (auto &PossibleReducedVals : PossibleReducedValsVect) { auto PossibleRedVals = PossibleReducedVals.second.takeVector(); SmallVector> PossibleRedValsVect; - for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); - It != E; ++It) { + for (auto &Slice : PossibleRedVals) { PossibleRedValsVect.emplace_back(); - auto RedValsVect = It->second.takeVector(); + auto RedValsVect = Slice.second.takeVector(); stable_sort(RedValsVect, llvm::less_second()); for (const std::pair &Data : RedValsVect) PossibleRedValsVect.back().append(Data.second, Data.first); @@ -22361,8 +22376,8 @@ class HorizontalReduction { SmallVector Candidates; Candidates.reserve(2 * OrigReducedVals.size()); DenseMap TrackedToOrig(2 * OrigReducedVals.size()); - for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { - Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]); + for (Value *ReducedVal : OrigReducedVals) { + Value *RdxVal = TrackedVals.at(ReducedVal); // Check if the reduction value was not overriden by the extractelement // instruction because of the vectorization and exclude it, if it is not // compatible with other values. @@ -22373,7 +22388,7 @@ class HorizontalReduction { (S && !Inst)) continue; Candidates.push_back(RdxVal); - TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); + TrackedToOrig.try_emplace(RdxVal, ReducedVal); } bool ShuffledExtracts = false; // Try to handle shuffled extractelements. 
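A note on the ScalarEvolutionExpander change earlier in this patch: visitAddRecExpr now tries to reuse an existing LCSSA phi instead of re-expanding the addrec outside its defining loop. The check can be read as the following minimal sketch over LLVM's public ScalarEvolution and DominatorTree APIs; the free-function form and the name findReusableLCSSAPhi are illustrative, not part of the patch.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Sketch: return an existing LCSSA phi in the loop's unique exit block
// whose SCEV is exactly S, provided the exit dominates the expansion
// point; otherwise return nullptr and fall back to normal expansion.
static PHINode *findReusableLCSSAPhi(ScalarEvolution &SE, DominatorTree &DT,
                                     const SCEVAddRecExpr *S,
                                     BasicBlock *InsertBlock) {
  const Loop *L = S->getLoop();
  BasicBlock *EB = L->getExitBlock();
  // A single exit block with a single predecessor means any phi in it is
  // a single-entry LCSSA phi for a value defined in L; dominance ensures
  // the phi's value is available at the expansion point.
  if (!EB || !EB->getSinglePredecessor() || !DT.dominates(EB, InsertBlock))
    return nullptr;
  for (PHINode &PN : EB->phis()) {
    if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType())
      continue;
    // SCEVs are uniqued, so pointer equality means "same expression".
    if (SE.getSCEV(&PN) == S)
      return &PN;
  }
  return nullptr;
}

Pointer equality on the SCEVs is the whole test: ScalarEvolution uniques its expressions, so a phi whose SCEV is the same object necessarily computes the same value at the exit.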
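The SimplifyCFG hunk in this patch reorders the bookkeeping in simplifyDuplicateSwitchArms so that already-seen successors are handled first (recording only their extra case index) and the unique-predecessor restriction applies just to first visits; the hasNPredecessorsOrMore(4) cap is dropped. A plain-C++ restatement of the reordered flow, with stand-in types rather than LLVM's (phi collection omitted):

#include <map>
#include <set>
#include <vector>

struct Block; // opaque stand-in for BasicBlock

void collectCases(const std::vector<Block *> &Successors, Block *SwitchBB,
                  Block *(*uniquePred)(Block *),
                  std::vector<Block *> &Cases,
                  std::map<Block *, std::vector<unsigned>> &SuccIndexes) {
  std::set<Block *> Seen;
  for (unsigned I = 0; I < Successors.size(); ++I) {
    Block *BB = Successors[I];
    if (!Seen.insert(BB).second) {
      // Revisit: the block is already a candidate (or was rejected);
      // just remember this additional case index if it qualified.
      auto It = SuccIndexes.find(BB);
      if (It != SuccIndexes.end())
        It->second.push_back(I);
      continue;
    }
    // First visit: only consider blocks whose unique predecessor is the
    // switch itself (same FIXME as in the patch).
    if (uniquePred(BB) != SwitchBB)
      continue;
    Cases.push_back(BB);
    SuccIndexes[BB].push_back(I);
  }
}

Because getUniquePredecessor allows multiple edges from the same block, a block targeted by many cases of the switch now remains eligible for deduplication; only blocks with predecessors other than the switch are skipped.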
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 85741b977bb77..204268e586b43 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -597,6 +597,7 @@ class VPIRFlags { enum class OperationType : unsigned char { Cmp, OverflowingBinOp, + Trunc, DisjointOp, PossiblyExactOp, GEPOp, @@ -613,6 +614,13 @@ class VPIRFlags { WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {} }; + struct TruncFlagsTy { + char HasNUW : 1; + char HasNSW : 1; + + TruncFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {} + }; + struct DisjointFlagsTy { char IsDisjoint : 1; DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {} @@ -644,6 +652,7 @@ class VPIRFlags { union { CmpInst::Predicate CmpPredicate; WrapFlagsTy WrapFlags; + TruncFlagsTy TruncFlags; DisjointFlagsTy DisjointFlags; ExactFlagsTy ExactFlags; GEPNoWrapFlags GEPFlags; @@ -665,6 +674,9 @@ class VPIRFlags { } else if (auto *Op = dyn_cast(&I)) { OpType = OperationType::OverflowingBinOp; WrapFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()}; + } else if (auto *Op = dyn_cast(&I)) { + OpType = OperationType::Trunc; + TruncFlags = {Op->hasNoUnsignedWrap(), Op->hasNoSignedWrap()}; } else if (auto *Op = dyn_cast(&I)) { OpType = OperationType::PossiblyExactOp; ExactFlags.IsExact = Op->isExact(); @@ -715,6 +727,10 @@ class VPIRFlags { WrapFlags.HasNUW = false; WrapFlags.HasNSW = false; break; + case OperationType::Trunc: + TruncFlags.HasNUW = false; + TruncFlags.HasNSW = false; + break; case OperationType::DisjointOp: DisjointFlags.IsDisjoint = false; break; @@ -744,6 +760,10 @@ class VPIRFlags { I.setHasNoUnsignedWrap(WrapFlags.HasNUW); I.setHasNoSignedWrap(WrapFlags.HasNSW); break; + case OperationType::Trunc: + I.setHasNoUnsignedWrap(TruncFlags.HasNUW); + I.setHasNoSignedWrap(TruncFlags.HasNSW); + break; case OperationType::DisjointOp: cast(&I)->setIsDisjoint(DisjointFlags.IsDisjoint); break; @@ -800,15 +820,25 @@ class VPIRFlags { } bool hasNoUnsignedWrap() const { - assert(OpType == OperationType::OverflowingBinOp && - "recipe doesn't have a NUW flag"); - return WrapFlags.HasNUW; + switch (OpType) { + case OperationType::OverflowingBinOp: + return WrapFlags.HasNUW; + case OperationType::Trunc: + return TruncFlags.HasNUW; + default: + llvm_unreachable("recipe doesn't have a NUW flag"); + } } bool hasNoSignedWrap() const { - assert(OpType == OperationType::OverflowingBinOp && - "recipe doesn't have a NSW flag"); - return WrapFlags.HasNSW; + switch (OpType) { + case OperationType::OverflowingBinOp: + return WrapFlags.HasNSW; + case OperationType::Trunc: + return TruncFlags.HasNSW; + default: + llvm_unreachable("recipe doesn't have a NSW flag"); + } } bool isDisjoint() const { @@ -1327,9 +1357,10 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, public: VPWidenRecipe(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, DebugLoc DL) + const VPIRFlags &Flags, const VPIRMetadata &Metadata, + DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), - Opcode(Opcode) {} + VPIRMetadata(Metadata), Opcode(Opcode) {} VPWidenRecipe(Instruction &I, ArrayRef Operands) : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPIRMetadata(I), @@ -1338,8 +1369,9 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, ~VPWidenRecipe() override = default; VPWidenRecipe *clone() override { - auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands()); - R->transferFlags(*this); + 
auto *R = + new VPWidenRecipe(getOpcode(), operands(), *this, *this, getDebugLoc()); + R->setUnderlyingValue(getUnderlyingValue()); return R; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 92db9674ef42b..b27a7ffeed208 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -16,7 +16,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/GenericDomTreeConstruction.h" using namespace llvm; diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 52eecb000d0c2..7fb5e82f9d32b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -591,6 +591,30 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) { TopRegion->getEntryBasicBlock()->setName("vector.body"); } +void VPlanTransforms::createExtractsForLiveOuts(VPlan &Plan) { + for (VPBasicBlock *EB : Plan.getExitBlocks()) { + VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock(); + VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); + + if (EB->getSinglePredecessor() != Plan.getMiddleBlock()) + continue; + + for (VPRecipeBase &R : EB->phis()) { + auto *ExitIRI = cast(&R); + for (unsigned Idx = 0; Idx != ExitIRI->getNumIncoming(); ++Idx) { + VPRecipeBase *Inc = ExitIRI->getIncomingValue(Idx)->getDefiningRecipe(); + if (!Inc || !Inc->getParent()->getParent()) + continue; + assert(ExitIRI->getNumOperands() == 1 && + ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && + "exit values from early exits must be fixed when branch to " + "early-exit is added"); + ExitIRI->extractLastLaneOfFirstOperand(B); + } + } + } +} + // Likelyhood of bypassing the vectorized loop due to a runtime check block, // including memory overlap checks block and wrapping/unit-stride checks block. 
static constexpr uint32_t CheckBypassWeights[] = {1, 127}; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3c367664a0988..1664bcc3881aa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -67,8 +67,10 @@ bool VPRecipeBase::mayWriteToMemory() const { ->onlyReadsMemory(); case VPWidenIntrinsicSC: return cast(this)->mayWriteToMemory(); + case VPCanonicalIVPHISC: case VPBranchOnMaskSC: case VPFirstOrderRecurrencePHISC: + case VPReductionPHISC: case VPScalarIVStepsSC: case VPPredInstPHISC: return false; @@ -1763,6 +1765,8 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { return Opcode == Instruction::Add || Opcode == Instruction::Sub || Opcode == Instruction::Mul || Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart; + case OperationType::Trunc: + return Opcode == Instruction::Trunc; case OperationType::DisjointOp: return Opcode == Instruction::Or; case OperationType::PossiblyExactOp: @@ -1810,6 +1814,12 @@ void VPIRFlags::printFlags(raw_ostream &O) const { if (WrapFlags.HasNSW) O << " nsw"; break; + case OperationType::Trunc: + if (TruncFlags.HasNUW) + O << " nuw"; + if (TruncFlags.HasNSW) + O << " nsw"; + break; case OperationType::FPMathOp: getFastMathFlags().print(O); break; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 870b1bb68b79a..84a12470f45e4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -74,6 +74,10 @@ struct VPlanTransforms { /// flat CFG into a hierarchical CFG. LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan); + /// Creates extracts for values in \p Plan defined in a loop region and used + /// outside a loop region. + LLVM_ABI_FOR_TEST static void createExtractsForLiveOuts(VPlan &Plan); + /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a /// VPValue and connect the block to \p Plan, using the VPValue as branch /// condition. 
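The VPIRFlags extension above carries trunc's nuw/nsw through VPlan via the new TruncFlagsTy and OperationType::Trunc arms. As a reference for the flag plumbing, a small standalone IRBuilder demo (not patch code) that builds a 'trunc nuw nsw' and then clears both flags the way dropPoisonGeneratingFlags now does for the Trunc case:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("trunc-flags-demo", Ctx);
  IRBuilder<> B(Ctx);
  auto *FTy = FunctionType::get(B.getInt8Ty(), {B.getInt32Ty()}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  // 'trunc nuw nsw i32 %x to i8' -- the flags VPIRFlags now records in
  // TruncFlagsTy when a recipe is built from a TruncInst.
  Value *T = B.CreateTrunc(F->getArg(0), B.getInt8Ty(), "t",
                           /*IsNUW=*/true, /*IsNSW=*/true);

  // Mirror of the new OperationType::Trunc arm in
  // dropPoisonGeneratingFlags: both wrap flags are cleared.
  auto *TI = cast<Instruction>(T);
  TI->setHasNoUnsignedWrap(false);
  TI->setHasNoSignedWrap(false);

  B.CreateRet(T);
  M.print(outs(), nullptr);
  return 0;
}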
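createExtractsForLiveOuts places the extracts in the middle block, one per exit phi whose incoming value is defined inside a loop region. For a fixed vectorization factor, what extractLastLaneOfFirstOperand ultimately materializes is an extractelement of the final lane feeding the scalar LCSSA phi; a hedged IRBuilder illustration, where the helper name and the fixed-VF assumption are for the demo only (scalable VFs need a runtime lane index instead):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// For a vectorized live-out %wide and a fixed vectorization factor VF,
// the scalar value feeding the LCSSA phi in the exit block is the last
// lane computed by the final vector iteration.
static Value *emitLastLaneExtract(IRBuilder<> &B, Value *Wide, unsigned VF) {
  return B.CreateExtractElement(Wide, B.getInt32(VF - 1),
                                Twine(Wide->getName()) + ".lcssa");
}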
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll b/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll index 8b52b123d6d25..3fbd12ac8813c 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll @@ -27,20 +27,20 @@ define i32 @sadd(i32 %arg) { ; CHECK-LABEL: 'sadd' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:4 Lat:4 SizeLat:4 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:6 SizeLat:6 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:10 Lat:10 SizeLat:10 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; CHECK-NEXT: Cost Model: Found costs of 1 for: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:4 Lat:4 SizeLat:4 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:6 SizeLat:6 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:10 Lat:10 SizeLat:10 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:6 SizeLat:6 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 
undef, i8 undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:6 SizeLat:6 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:10 Lat:10 SizeLat:10 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %I64 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) @@ -90,20 +90,20 @@ define i32 @uadd(i32 %arg) { ; CHECK-LABEL: 'uadd' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:2 SizeLat:2 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:3 SizeLat:3 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:5 SizeLat:5 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; CHECK-NEXT: Cost Model: Found costs of 1 for: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:2 SizeLat:2 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:3 SizeLat:3 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:5 SizeLat:5 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: 
%V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:2 SizeLat:2 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:3 SizeLat:3 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:5 SizeLat:5 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:2 SizeLat:2 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:3 SizeLat:3 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:5 SizeLat:5 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %I64 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) @@ -153,20 +153,20 @@ define i32 @ssub(i32 %arg) { ; CHECK-LABEL: 'ssub' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:4 Lat:4 SizeLat:4 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:6 SizeLat:6 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:10 Lat:10 SizeLat:10 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; CHECK-NEXT: Cost Model: Found costs of 1 for: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:4 Lat:4 SizeLat:4 for: %V16I32 = call { <16 x i32>, <16 x i1> } 
@llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:6 SizeLat:6 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:13 CodeSize:10 Lat:10 SizeLat:10 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:6 SizeLat:6 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:6 SizeLat:6 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:10 Lat:10 SizeLat:10 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %I64 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) @@ -216,20 +216,20 @@ define i32 @usub(i32 %arg) { ; CHECK-LABEL: 'usub' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:2 SizeLat:2 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:3 SizeLat:3 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x 
i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:5 SizeLat:5 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; CHECK-NEXT: Cost Model: Found costs of 1 for: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:2 SizeLat:2 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:3 SizeLat:3 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:5 SizeLat:5 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:2 SizeLat:2 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:3 SizeLat:3 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:5 SizeLat:5 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:2 SizeLat:2 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:3 SizeLat:3 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:5 Lat:5 SizeLat:5 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %I64 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 undef, i64 undef) @@ -279,20 +279,20 @@ define i32 @smul(i32 %arg) { ; CHECK-LABEL: 'smul' ; CHECK-NEXT: Cost 
Model: Found costs of 3 for: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:34 CodeSize:8 Lat:8 SizeLat:8 for: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:68 CodeSize:8 Lat:8 SizeLat:8 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:136 CodeSize:8 Lat:8 SizeLat:8 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:68 CodeSize:9 Lat:9 SizeLat:9 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:136 CodeSize:11 Lat:11 SizeLat:11 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:8 Lat:8 SizeLat:8 for: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:76 CodeSize:8 Lat:8 SizeLat:8 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:152 CodeSize:8 Lat:8 SizeLat:8 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:76 CodeSize:9 Lat:9 SizeLat:9 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:152 CodeSize:11 Lat:11 SizeLat:11 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:8 Lat:8 SizeLat:8 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:8 Lat:8 SizeLat:8 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:9 Lat:9 SizeLat:9 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:11 Lat:11 SizeLat:11 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:8 
Lat:8 SizeLat:8 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:8 Lat:8 SizeLat:8 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:9 Lat:9 SizeLat:9 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:11 Lat:11 SizeLat:11 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef) @@ -342,20 +342,20 @@ define i32 @umul(i32 %arg) { ; CHECK-LABEL: 'umul' ; CHECK-NEXT: Cost Model: Found costs of 3 for: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:33 CodeSize:7 Lat:7 SizeLat:7 for: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:66 CodeSize:7 Lat:7 SizeLat:7 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:132 CodeSize:7 Lat:7 SizeLat:7 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:66 CodeSize:8 Lat:8 SizeLat:8 for: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:132 CodeSize:10 Lat:10 SizeLat:10 for: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:37 CodeSize:7 Lat:7 SizeLat:7 for: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:74 CodeSize:7 Lat:7 SizeLat:7 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:148 CodeSize:7 Lat:7 SizeLat:7 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:74 CodeSize:8 Lat:8 SizeLat:8 for: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:148 CodeSize:10 Lat:10 SizeLat:10 for: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:7 SizeLat:7 for: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:7 Lat:7 SizeLat:7 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> 
undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:44 CodeSize:7 Lat:7 SizeLat:7 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:8 Lat:8 SizeLat:8 for: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:44 CodeSize:10 Lat:10 SizeLat:10 for: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:7 Lat:7 SizeLat:7 for: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:7 Lat:7 SizeLat:7 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:44 CodeSize:7 Lat:7 SizeLat:7 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:8 Lat:8 SizeLat:8 for: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:44 CodeSize:10 Lat:10 SizeLat:10 for: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll index b55a759a78cc3..10c995786d57a 100644 --- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll @@ -9,7 +9,7 @@ define void @cmps() { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %c16 = icmp ult i16 undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %c32 = icmp sge i32 undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %c64 = icmp ne i64 undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c128 = icmp ult i128 undef, undef +; CHECK-NEXT: Cost Model: Found costs of 2 for: %c128 = icmp ult i128 undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cv16i8 = icmp slt <16 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cv8i16 = icmp ult <8 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cv4i32 = icmp sge <4 x i32> undef, undef @@ -17,10 +17,10 @@ define void @cmps() { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cf32 = fcmp ogt float undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cf64 = fcmp ogt double undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cbf64 = fcmp ogt bfloat undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cfv816 = fcmp olt <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cfv816 = fcmp olt <8 x half> undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cfv432 = fcmp oge <4 x float> undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cfv264 = fcmp oge <2 x double> undef, undef -; CHECK-NEXT: 
Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cbfv816 = fcmp olt <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cbfv816 = fcmp olt <8 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %c8 = icmp slt i8 undef, undef @@ -62,7 +62,7 @@ define void @andcmp() { ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64leneg = icmp sle i64 %a64, -1 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64gtneg = icmp sgt i64 %a64, -1 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a128 = and i128 undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c128 = icmp eq i128 %a128, 0 +; CHECK-NEXT: Cost Model: Found costs of 2 for: %c128 = icmp eq i128 %a128, 0 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %av16i8 = and <16 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %cv16i8 = icmp ne <16 x i8> %av16i8, zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %av8i16 = and <8 x i16> undef, undef diff --git a/llvm/test/Analysis/CostModel/AArch64/fcmp.ll b/llvm/test/Analysis/CostModel/AArch64/fcmp.ll index fb9888d98af6b..d2a1340f60460 100644 --- a/llvm/test/Analysis/CostModel/AArch64/fcmp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/fcmp.ll @@ -7,12 +7,12 @@ define void @fcmp_oeq(i32 %arg) { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp oeq float undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp oeq <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp oeq <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp oeq <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp oeq <8 x float> undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp oeq double undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp oeq <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp oeq <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp oeq <4 x double> undef, undef ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp oeq fp128 undef, undef -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp oeq <2 x fp128> undef, undef +; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp oeq <2 x fp128> undef, undef ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %f32 = fcmp oeq float undef, undef @@ -32,8 +32,8 @@ define void @fcmp_oeq_half(i32 %arg) { ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp oeq half undef, undef ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp oeq <2 x half> undef, undef ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp oeq <4 x half> undef, undef -; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp oeq <8 x half> undef, undef -; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp oeq <16 x half> undef, undef +; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp oeq <8 x half> undef, undef +; 
CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp oeq <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_oeq_half'
@@ -41,7 +41,7 @@ define void @fcmp_oeq_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp oeq <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp oeq <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp oeq <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp oeq <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp oeq <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp oeq half undef, undef
@@ -57,8 +57,8 @@ define void @fcmp_oeq_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp oeq bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp oeq <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp oeq <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp oeq <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp oeq <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oeq <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp oeq <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp oeq bfloat undef, undef
@@ -74,12 +74,12 @@ define void @fcmp_ogt(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp ogt float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ogt <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ogt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ogt <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ogt <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp ogt double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ogt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ogt <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ogt <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp ogt fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp ogt <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp ogt <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp ogt float undef, undef
@@ -99,8 +99,8 @@ define void @fcmp_ogt_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp ogt half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp ogt <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp ogt <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp ogt <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp ogt <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp ogt <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp ogt <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_ogt_half'
@@ -108,7 +108,7 @@ define void @fcmp_ogt_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ogt <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ogt <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ogt <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ogt <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ogt <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp ogt half undef, undef
@@ -124,8 +124,8 @@ define void @fcmp_ogt_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp ogt bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ogt <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ogt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ogt <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp ogt <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ogt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ogt <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp ogt bfloat undef, undef
@@ -141,12 +141,12 @@ define void @fcmp_oge(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp oge float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp oge <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp oge <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp oge <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp oge <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp oge double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp oge <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp oge <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp oge <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp oge fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp oge <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp oge <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp oge float undef, undef
@@ -166,8 +166,8 @@ define void @fcmp_oge_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp oge half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp oge <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp oge <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp oge <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp oge <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp oge <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp oge <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_oge_half'
@@ -175,7 +175,7 @@ define void @fcmp_oge_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp oge <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp oge <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp oge <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp oge <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp oge <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp oge half undef, undef
@@ -191,8 +191,8 @@ define void @fcmp_oge_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp oge bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp oge <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp oge <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp oge <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp oge <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oge <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp oge <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp oge bfloat undef, undef
@@ -208,12 +208,12 @@ define void @fcmp_olt(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp olt float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp olt <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp olt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp olt <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp olt <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp olt double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp olt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp olt <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp olt <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp olt fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp olt <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp olt <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp olt float undef, undef
@@ -233,8 +233,8 @@ define void @fcmp_olt_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp olt half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp olt <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp olt <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp olt <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp olt <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp olt <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp olt <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_olt_half'
@@ -242,7 +242,7 @@ define void @fcmp_olt_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp olt <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp olt <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp olt <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp olt <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp olt <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp olt half undef, undef
@@ -258,8 +258,8 @@ define void @fcmp_olt_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp olt bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp olt <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp olt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp olt <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp olt <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp olt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp olt <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp olt bfloat undef, undef
@@ -275,12 +275,12 @@ define void @fcmp_ole(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp ole float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ole <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ole <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ole <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ole <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp ole double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ole <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ole <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ole <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp ole fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp ole <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp ole <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp ole float undef, undef
@@ -300,8 +300,8 @@ define void @fcmp_ole_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp ole half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp ole <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp ole <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp ole <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp ole <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp ole <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp ole <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_ole_half'
@@ -309,7 +309,7 @@ define void @fcmp_ole_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ole <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ole <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ole <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ole <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ole <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp ole half undef, undef
@@ -325,8 +325,8 @@ define void @fcmp_ole_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp ole bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ole <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ole <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ole <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp ole <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ole <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ole <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp ole bfloat undef, undef
@@ -342,12 +342,12 @@ define void @fcmp_one(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp one float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp one <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp one <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp one <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp one <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp one double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp one <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp one <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp one <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp one fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp one <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp one <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp one float undef, undef
@@ -367,8 +367,8 @@ define void @fcmp_one_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp one half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp one <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp one <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp one <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp one <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp one <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp one <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_one_half'
@@ -376,7 +376,7 @@ define void @fcmp_one_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp one <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp one <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp one <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp one <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp one <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp one half undef, undef
@@ -392,8 +392,8 @@ define void @fcmp_one_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp one bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp one <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp one <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp one <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp one <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp one <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp one <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp one bfloat undef, undef
@@ -409,12 +409,12 @@ define void @fcmp_ord(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp ord float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ord <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ord <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ord <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ord <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp ord double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ord <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ord <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ord <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp ord fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp ord <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp ord <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp ord float undef, undef
@@ -434,8 +434,8 @@ define void @fcmp_ord_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp ord half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp ord <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp ord <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp ord <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp ord <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp ord <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp ord <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_ord_half'
@@ -443,7 +443,7 @@ define void @fcmp_ord_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ord <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ord <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ord <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ord <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ord <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp ord half undef, undef
@@ -459,8 +459,8 @@ define void @fcmp_ord_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp ord bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ord <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ord <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ord <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp ord <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ord <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ord <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp ord bfloat undef, undef
@@ -476,12 +476,12 @@ define void @fcmp_ueq(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp ueq float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ueq <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ueq <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ueq <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ueq <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp ueq double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ueq <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ueq <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ueq <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp ueq fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp ueq <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp ueq <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp ueq float undef, undef
@@ -501,8 +501,8 @@ define void @fcmp_ueq_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp ueq half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp ueq <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp ueq <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp ueq <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp ueq <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp ueq <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp ueq <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_ueq_half'
@@ -510,7 +510,7 @@ define void @fcmp_ueq_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ueq <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ueq <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ueq <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ueq <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ueq <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp ueq half undef, undef
@@ -526,8 +526,8 @@ define void @fcmp_ueq_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp ueq bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ueq <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ueq <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ueq <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp ueq <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ueq <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ueq <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp ueq bfloat undef, undef
@@ -543,12 +543,12 @@ define void @fcmp_ugt(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp ugt float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ugt <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ugt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ugt <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ugt <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp ugt double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ugt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ugt <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ugt <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp ugt fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp ugt <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp ugt <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp ugt float undef, undef
@@ -568,8 +568,8 @@ define void @fcmp_ugt_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp ugt half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp ugt <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp ugt <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp ugt <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp ugt <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp ugt <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp ugt <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_ugt_half'
@@ -577,7 +577,7 @@ define void @fcmp_ugt_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ugt <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ugt <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ugt <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ugt <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ugt <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp ugt half undef, undef
@@ -593,8 +593,8 @@ define void @fcmp_ugt_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp ugt bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ugt <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ugt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ugt <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp ugt <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ugt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ugt <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp ugt bfloat undef, undef
@@ -610,12 +610,12 @@ define void @fcmp_uge(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp uge float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp uge <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp uge <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp uge <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp uge <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp uge double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp uge <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp uge <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp uge <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp uge fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp uge <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp uge <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp uge float undef, undef
@@ -635,8 +635,8 @@ define void @fcmp_uge_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp uge half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp uge <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp uge <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp uge <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp uge <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp uge <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp uge <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_uge_half'
@@ -644,7 +644,7 @@ define void @fcmp_uge_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp uge <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp uge <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp uge <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp uge <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp uge <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp uge half undef, undef
@@ -660,8 +660,8 @@ define void @fcmp_uge_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp uge bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp uge <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp uge <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp uge <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp uge <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uge <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp uge <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp uge bfloat undef, undef
@@ -677,12 +677,12 @@ define void @fcmp_ult(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp ult float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ult <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ult <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ult <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ult <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp ult double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ult <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ult <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ult <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp ult fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp ult <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp ult <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp ult float undef, undef
@@ -702,8 +702,8 @@ define void @fcmp_ult_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp ult half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp ult <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp ult <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp ult <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp ult <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp ult <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp ult <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_ult_half'
@@ -711,7 +711,7 @@ define void @fcmp_ult_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ult <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ult <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ult <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ult <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ult <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp ult half undef, undef
@@ -727,8 +727,8 @@ define void @fcmp_ult_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp ult bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ult <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ult <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ult <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp ult <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ult <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ult <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp ult bfloat undef, undef
@@ -744,12 +744,12 @@ define void @fcmp_ule(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp ule float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ule <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ule <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ule <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ule <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp ule double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ule <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ule <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ule <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp ule fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp ule <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp ule <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp ule float undef, undef
@@ -769,8 +769,8 @@ define void @fcmp_ule_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp ule half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp ule <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp ule <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp ule <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp ule <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp ule <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp ule <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_ule_half'
@@ -778,7 +778,7 @@ define void @fcmp_ule_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ule <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ule <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ule <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ule <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ule <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp ule half undef, undef
@@ -794,8 +794,8 @@ define void @fcmp_ule_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp ule bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ule <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ule <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ule <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp ule <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ule <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ule <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp ule bfloat undef, undef
@@ -811,12 +811,12 @@ define void @fcmp_une(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp une float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp une <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp une <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp une <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp une <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp une double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp une <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp une <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp une <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp une fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp une <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp une <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp une float undef, undef
@@ -836,8 +836,8 @@ define void @fcmp_une_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp une half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp une <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp une <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp une <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp une <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp une <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp une <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_une_half'
@@ -845,7 +845,7 @@ define void @fcmp_une_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp une <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp une <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp une <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp une <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp une <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp une half undef, undef
@@ -861,8 +861,8 @@ define void @fcmp_une_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp une bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp une <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp une <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp une <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp une <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp une <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp une <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp une bfloat undef, undef
@@ -878,12 +878,12 @@ define void @fcmp_uno(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp uno float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp uno <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp uno <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp uno <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp uno <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp uno double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp uno <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp uno <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp uno <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp uno fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp uno <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp uno <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp uno float undef, undef
@@ -903,8 +903,8 @@ define void @fcmp_uno_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp uno half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp uno <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp uno <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp uno <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp uno <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp uno <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp uno <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_uno_half'
@@ -912,7 +912,7 @@ define void @fcmp_uno_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp uno <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp uno <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp uno <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp uno <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp uno <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp uno half undef, undef
@@ -928,8 +928,8 @@ define void @fcmp_uno_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp uno bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp uno <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp uno <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp uno <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp uno <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uno <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp uno <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp uno bfloat undef, undef
@@ -945,12 +945,12 @@ define void @fcmp_true(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp true float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp true <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp true <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp true <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp true <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp true double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp true <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp true <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp true <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp true fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp true <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp true <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp true float undef, undef
@@ -970,8 +970,8 @@ define void @fcmp_true_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp true half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp true <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp true <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp true <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp true <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp true <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp true <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_true_half'
@@ -979,7 +979,7 @@ define void @fcmp_true_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp true <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp true <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp true <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp true <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp true <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp true half undef, undef
@@ -995,8 +995,8 @@ define void @fcmp_true_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp true bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp true <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp true <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp true <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp true <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp true <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp true <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp true bfloat undef, undef
@@ -1012,12 +1012,12 @@ define void @fcmp_false(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f32 = fcmp false float undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp false <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp false <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp false <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp false <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f64 = fcmp false double undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp false <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp false <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp false <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %f128 = fcmp false fp128 undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2f128 = fcmp false <2 x fp128> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f128 = fcmp false <2 x fp128> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f32 = fcmp false float undef, undef
@@ -1037,8 +1037,8 @@ define void @fcmp_false_half(i32 %arg) {
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %f16 = fcmp false half undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 3 for: %v2f16 = fcmp false <2 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %v4f16 = fcmp false <4 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8f16 = fcmp false <8 x half> undef, undef
-; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16f16 = fcmp false <16 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8f16 = fcmp false <8 x half> undef, undef
+; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16f16 = fcmp false <16 x half> undef, undef
 ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-FP16-LABEL: 'fcmp_false_half'
@@ -1046,7 +1046,7 @@ define void @fcmp_false_half(i32 %arg) {
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp false <2 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp false <4 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp false <8 x half> undef, undef
-; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp false <16 x half> undef, undef
+; CHECK-FP16-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp false <16 x half> undef, undef
 ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %f16 = fcmp false half undef, undef
@@ -1062,8 +1062,8 @@ define void @fcmp_false_bfloat(i32 %arg) {
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %bf16 = fcmp false bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp false <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp false <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp false <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp false <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp false <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp false <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %bf16 = fcmp false bfloat undef, undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
index 59e86aefeeb6c..9d06b4bdec9b4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
@@ -236,7 +236,7 @@ declare <4 x i30> @llvm.fshl.v4i30(<4 x i30>, <4 x i30>, <4 x i30>)
 
 define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) {
 ; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:6 Lat:6 SizeLat:6 for: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> )
+; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> )
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %fshl
 ;
 entry:
@@ -259,7 +259,7 @@ declare i66 @llvm.fshl.i66(i66, i66, i66)
 
 define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) {
 ; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:6 Lat:6 SizeLat:6 for: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> )
+; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> )
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %fshl
 ;
 entry:
@@ -270,7 +270,7 @@ declare <2 x i128> @llvm.fshl.v4i128(<2 x i128>, <2 x i128>, <2 x i128>)
 
 define i128 @fshl_i128(i128 %a, i128 %b) {
 ; CHECK-LABEL: 'fshl_i128'
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:6 SizeLat:6 for: %fshl = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %fshl = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9)
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %fshl
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshr.ll b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
index 53cede900c398..b31806b647868 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshr.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
@@ -236,7 +236,7 @@ declare <4 x i30> @llvm.fshr.v4i30(<4 x i30>, <4 x i30>, <4 x i30>)
 
 define <2 x i66> @fshr_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) {
 ; CHECK-LABEL: 'fshr_v2i66_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:6 Lat:6 SizeLat:6 for: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> )
+; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> )
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %fshr
 ;
 entry:
@@ -259,7 +259,7 @@ declare i66 @llvm.fshr.i66(i66, i66, i66)
 
 define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) {
 ; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:6 Lat:6 SizeLat:6 for: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> )
+; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> )
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %fshr
 ;
 entry:
@@ -270,7 +270,7 @@ declare <2 x i128> @llvm.fshr.v4i128(<2 x i128>, <2 x i128>, <2 x i128>)
 
 define i128 @fshr_i128(i128 %a, i128 %b) {
 ; CHECK-LABEL: 'fshr_i128'
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:6 SizeLat:6 for: %fshr = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %fshr = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9)
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %fshr
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
index b484f8f6c60ba..b221fc8a35ab3 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
@@ -13,16 +13,16 @@ define void @reduce() {
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 18 for: %V32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 20 for: %V64i8 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found costs of 9 for: %V8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %V16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 5 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %V32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %V64i8 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
index 519b8ecf6dc76..4bb59e3a09b7a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
@@ -13,16 +13,16 @@ define void @reduce() {
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 18 for: %V32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 20 for: %V64i8 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found costs of 9 for: %V8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found costs of 10 for: %V16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 5 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %V32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %V64i8 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
index 2a8609d2f418b..8e81aadbb9934 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
@@ -13,16 +13,16 @@ define void @reduce() {
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8
@llvm.vector.reduce.xor.v8i8(<8 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: %V32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 20 for: %V64i8 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 9 for: %V8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 10 for: %V16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 5 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 8 for: %V32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 10 for: %V64i8 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll index 41c272291d7ca..4579acb9b3555 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -93,36 +93,36 @@ define void @insert_subvec() { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i8_2_1 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i8_2_2 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i8_2_3 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8_4_1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8_4_2 = 
shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32_4_1 = shufflevector 
<16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> @@ -369,7 +369,7 @@ define void @multipart() { ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 16 for: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 16 for: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> @@ -408,10 +408,10 @@ define void @vst3(ptr %p) { ; CHECK-LABEL: 'vst3' ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> ; CHECK-NEXT: Cost Model: Found costs of 48 for: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found costs of 24 for: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> ; CHECK-NEXT: Cost Model: Found costs of 48 for: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found costs of 5 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> @@ -452,10 +452,10 @@ define void @vst4(ptr %p) { ; CHECK-LABEL: 'vst4' ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: 
Cost Model: Found costs of 8 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found costs of 64 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 32 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found costs of 64 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll index 09f116f01ec77..4a003a0085c23 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: 'sel_v8i8' -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0 ; %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> @@ -14,7 +14,7 @@ define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: 'sel_v16i8' -; CHECK-NEXT: Cost Model: Found costs of RThru:60 CodeSize:30 Lat:60 SizeLat:60 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0 ; %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> @@ -32,7 +32,7 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: 'sel_v8i16' -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0 ; %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll 
index 9a0e5c3eb7964..83f62c8b44303 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll
@@ -22,7 +22,7 @@ define void @cmp_legal_int() {
 ; Check icmp for an illegal integer vector.
 define <vscale x 4 x i1> @cmp_nxv4i64() {
 ; CHECK-LABEL: 'cmp_nxv4i64'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = icmp ne <vscale x 4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %res = icmp ne <vscale x 4 x i64> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 4 x i1> %res
 ;
 %res = icmp ne <vscale x 4 x i64> undef, undef
@@ -48,7 +48,7 @@ define void @cmp_legal_pred() {
 ; Check icmp for an illegal predicate vector.
 define <vscale x 32 x i1> @cmp_nxv32i1() {
 ; CHECK-LABEL: 'cmp_nxv32i1'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = icmp ne <vscale x 32 x i1> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %res = icmp ne <vscale x 32 x i1> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 32 x i1> %res
 ;
 %res = icmp ne <vscale x 32 x i1> undef, undef
@@ -74,7 +74,7 @@ define void @cmp_legal_fp() #0 {
 ; Check fcmp for an illegal FP vector
 define <vscale x 16 x i1> @cmp_nxv16f16() {
 ; CHECK-LABEL: 'cmp_nxv16f16'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = fcmp oge <vscale x 16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %res = fcmp oge <vscale x 16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 16 x i1> %res
 ;
 %res = fcmp oge <vscale x 16 x half> undef, undef
@@ -100,7 +100,7 @@ define void @sel_legal_int() {
 ; Check select for an illegal integer vector
 define <vscale x 16 x i16> @sel_nxv16i16() {
 ; CHECK-LABEL: 'sel_nxv16i16'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = select <vscale x 16 x i1> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %res = select <vscale x 16 x i1> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 16 x i16> %res
 ;
 %res = select <vscale x 16 x i1> undef, <vscale x 16 x i16> undef, <vscale x 16 x i16> undef
@@ -126,7 +126,7 @@ define void @sel_legal_fp() #0 {
 ; Check select for an illegal FP vector
 define <vscale x 8 x float> @sel_nxv8f32() {
 ; CHECK-LABEL: 'sel_nxv8f32'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = select <vscale x 8 x i1> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %res = select <vscale x 8 x i1> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 8 x float> %res
 ;
 %res = select <vscale x 8 x i1> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef
@@ -152,7 +152,7 @@ define void @sel_legal_pred() {
 ; Check select for an illegal predicate vector
 define <vscale x 32 x i1> @sel_nxv32i1() {
 ; CHECK-LABEL: 'sel_nxv32i1'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = select <vscale x 32 x i1> undef, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %res = select <vscale x 32 x i1> undef, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 32 x i1> %res
 ;
 %res = select <vscale x 32 x i1> undef, <vscale x 32 x i1> undef, <vscale x 32 x i1> undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
index c055d3218f65b..84a9ca0fab6d8 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
@@ -10,6 +10,7 @@ define void @sdiv() {
 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V6i32 = sdiv <6 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i32 = sdiv <8 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, undef
@@ -19,6 +20,7 @@ define void @sdiv() {
 ; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V6i8 = sdiv <6 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, undef
@@ -48,6 +50,7 @@ define void @sdiv() {
 %V8i64 = sdiv <8 x i64> undef, undef
 %V2i32 = sdiv <2 x i32> undef, undef
 %V4i32 = sdiv <4 x i32> undef, undef
+ %V6i32 = sdiv <6 x i32> undef, undef
 %V8i32 = sdiv <8 x i32> undef, undef
 %V16i32 = sdiv <16 x i32> undef, undef
 %V2i16 = sdiv <2 x i16> undef, undef
@@ -57,6 +60,7 @@ define void @sdiv() {
 %V32i16 = sdiv <32 x i16> undef, undef
 %V2i8 = sdiv <2 x i8> undef, undef
 %V4i8 = sdiv <4 x i8> undef, undef
+ %V6i8 = sdiv <6 x i8> undef, undef
 %V8i8 = sdiv <8 x i8> undef, undef
 %V16i8 = sdiv <16 x i8> undef, undef
 %V32i8 = sdiv <32 x i8> undef, undef
@@ -89,6 +93,7 @@ define void @udiv() {
 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = udiv <8 x i64> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = udiv <2 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = udiv <4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V6i32 = udiv <6 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i32 = udiv <8 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = udiv <16 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = udiv <2 x i16> undef, undef
@@ -98,6 +103,7 @@ define void @udiv() {
 ; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = udiv <32 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = udiv <2 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = udiv <4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V6i8 = udiv <6 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = udiv <8 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = udiv <16 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = udiv <32 x i8> undef, undef
@@ -127,6 +133,7 @@ define void @udiv() {
 %V8i64 = udiv <8 x i64> undef, undef
 %V2i32 = udiv <2 x i32> undef, undef
 %V4i32 = udiv <4 x i32> undef, undef
+ %V6i32 = udiv <6 x i32> undef, undef
 %V8i32 = udiv <8 x i32> undef, undef
 %V16i32 = udiv <16 x i32> undef, undef
 %V2i16 = udiv <2 x i16> undef, undef
@@ -136,6 +143,7 @@ define void @udiv() {
 %V32i16 = udiv <32 x i16> undef, undef
 %V2i8 = udiv <2 x i8> undef, undef
 %V4i8 = udiv <4 x i8> undef, undef
+ %V6i8 = udiv <6 x i8> undef, undef
 %V8i8 = udiv <8 x i8> undef, undef
 %V16i8 = udiv <16 x i8> undef, undef
 %V32i8 = udiv <32 x i8> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
index 7fdd6b4971658..9801d14bcc6cd 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
@@ -5,13 +5,13 @@ define void @fcmp_oeq(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_oeq'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp oeq <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp oeq <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp oeq <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp oeq <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp oeq <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp oeq <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp oeq <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp oeq <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp oeq <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp oeq <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp oeq <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp oeq <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp oeq <2 x float> undef, undef
@@ -30,7 +30,7 @@ define void @fcmp_oeq_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_oeq_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp oeq <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp oeq <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp oeq <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oeq <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp oeq <2 x bfloat> undef, undef
@@ -43,13 +43,13 @@ define void @fcmp_ogt(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ogt'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ogt <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ogt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ogt <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ogt <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ogt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ogt <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ogt <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ogt <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ogt <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ogt <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ogt <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ogt <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp ogt <2 x float> undef, undef
@@ -68,7 +68,7 @@ define void @fcmp_ogt_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ogt_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ogt <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ogt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ogt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ogt <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp ogt <2 x bfloat> undef, undef
@@ -81,13 +81,13 @@ define void @fcmp_oge(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_oge'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp oge <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp oge <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp oge <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp oge <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp oge <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp oge <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp oge <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp oge <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp oge <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp oge <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp oge <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp oge <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp oge <2 x float> undef, undef
@@ -106,7 +106,7 @@ define void @fcmp_oge_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_oge_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp oge <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp oge <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp oge <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oge <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp oge <2 x bfloat> undef, undef
@@ -119,13 +119,13 @@ define void @fcmp_olt(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_olt'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp olt <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp olt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp olt <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp olt <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp olt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp olt <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp olt <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp olt <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp olt <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp olt <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp olt <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp olt <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp olt <2 x float> undef, undef
@@ -144,7 +144,7 @@ define void @fcmp_olt_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_olt_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp olt <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp olt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp olt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp olt <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp olt <2 x bfloat> undef, undef
@@ -157,13 +157,13 @@ define void @fcmp_ole(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ole'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ole <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ole <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ole <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ole <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ole <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ole <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ole <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ole <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ole <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ole <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ole <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ole <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp ole <2 x float> undef, undef
@@ -182,7 +182,7 @@ define void @fcmp_ole_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ole_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ole <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ole <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ole <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ole <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp ole <2 x bfloat> undef, undef
@@ -195,13 +195,13 @@ define void @fcmp_one(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_one'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp one <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp one <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp one <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp one <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp one <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp one <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp one <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp one <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp one <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp one <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp one <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp one <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp one <2 x float> undef, undef
@@ -220,7 +220,7 @@ define void @fcmp_one_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_one_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp one <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp one <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp one <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp one <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp one <2 x bfloat> undef, undef
@@ -233,13 +233,13 @@ define void @fcmp_ord(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ord'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ord <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ord <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ord <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ord <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ord <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ord <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ord <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ord <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ord <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ord <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ord <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ord <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp ord <2 x float> undef, undef
@@ -258,7 +258,7 @@ define void @fcmp_ord_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ord_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ord <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ord <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ord <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ord <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp ord <2 x bfloat> undef, undef
@@ -271,13 +271,13 @@ define void @fcmp_ueq(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ueq'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ueq <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ueq <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ueq <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ueq <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ueq <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ueq <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ueq <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ueq <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ueq <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ueq <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ueq <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ueq <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp ueq <2 x float> undef, undef
@@ -296,7 +296,7 @@ define void @fcmp_ueq_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ueq_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ueq <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ueq <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ueq <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ueq <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp ueq <2 x bfloat> undef, undef
@@ -309,13 +309,13 @@ define void @fcmp_ugt(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ugt'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ugt <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ugt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ugt <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ugt <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ugt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ugt <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ugt <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ugt <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ugt <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ugt <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ugt <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ugt <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp ugt <2 x float> undef, undef
@@ -334,7 +334,7 @@ define void @fcmp_ugt_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ugt_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ugt <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ugt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ugt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ugt <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp ugt <2 x bfloat> undef, undef
@@ -347,13 +347,13 @@ define void @fcmp_uge(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_uge'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp uge <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp uge <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp uge <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp uge <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp uge <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp uge <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp uge <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp uge <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp uge <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp uge <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp uge <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp uge <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp uge <2 x float> undef, undef
@@ -372,7 +372,7 @@ define void @fcmp_uge_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_uge_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp uge <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp uge <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp uge <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uge <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp uge <2 x bfloat> undef, undef
@@ -385,13 +385,13 @@ define void @fcmp_ult(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ult'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ult <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ult <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ult <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ult <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ult <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ult <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ult <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ult <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ult <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ult <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ult <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ult <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp ult <2 x float> undef, undef
@@ -410,7 +410,7 @@ define void @fcmp_ult_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ult_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ult <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ult <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ult <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ult <8 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp ult <2 x bfloat> undef, undef
@@ -423,13 +423,13 @@ define void @fcmp_ule(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ule'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ule <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ule <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp ule <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ule <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ule <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp ule <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ule <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ule <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ule <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ule <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp ule <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ule <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp ule <2 x float> undef, undef
@@ -448,8 +448,8 @@ define void @fcmp_ule_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_ule_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ule <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ule <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp ule <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp ule <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ule <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ule <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp ule <2 x bfloat> undef, undef
@@ -463,13 +463,13 @@ define void @fcmp_une(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_une'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp une <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp une <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp une <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp une <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp une <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp une <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp une <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp une <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp une <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp une <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp une <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp une <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp une <2 x float> undef, undef
@@ -488,8 +488,8 @@ define void @fcmp_une_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_une_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp une <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp une <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp une <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp une <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp une <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp une <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp une <2 x bfloat> undef, undef
@@ -503,13 +503,13 @@ define void @fcmp_uno(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_uno'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp uno <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp uno <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp uno <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp uno <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp uno <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp uno <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp uno <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp uno <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp uno <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp uno <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp uno <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp uno <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp uno <2 x float> undef, undef
@@ -528,8 +528,8 @@ define void @fcmp_uno_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_uno_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp uno <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp uno <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp uno <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp uno <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uno <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp uno <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp uno <2 x bfloat> undef, undef
@@ -543,13 +543,13 @@ define void @fcmp_true(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_true'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp true <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp true <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp true <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp true <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp true <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp true <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp true <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp true <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp true <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp true <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp true <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp true <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp true <2 x float> undef, undef
@@ -568,8 +568,8 @@ define void @fcmp_true_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_true_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp true <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp true <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp true <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp true <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp true <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp true <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp true <2 x bfloat> undef, undef
@@ -583,13 +583,13 @@ define void @fcmp_false(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_false'
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp false <2 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp false <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v8f32 = fcmp false <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp false <8 x float> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp false <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v4f64 = fcmp false <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp false <4 x double> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp false <2 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp false <4 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp false <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16f16 = fcmp false <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp false <16 x half> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2f32 = fcmp false <2 x float> undef, undef
@@ -608,8 +608,8 @@ define void @fcmp_false_bfloat(i32 %arg) {
 ; CHECK-LABEL: 'fcmp_false_bfloat'
 ; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp false <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp false <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %v8bf16 = fcmp false <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %v16bf16 = fcmp false <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp false <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp false <16 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %v2bf16 = fcmp false <2 x bfloat> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index ec84c58bf9681..ee485e2f861b4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -7,15 +7,15 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) {
 ; CHECK-VSCALE-1-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -44,7 +44,7 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call
@llvm.vector.extract.nxv4i1.nxv16i1( undef, i64 0) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call @llvm.vector.insert.nxv2f32.v2f32( undef, <2 x float> undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call @llvm.vector.insert.nxv4f32.nxv2f32( undef, undef, i64 0) @@ -56,7 +56,7 @@ define void @vector_insert_extract_idxzero_128b() #1 { ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call @llvm.vector.insert.nxv16i1.nxv2i1( undef, undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call @llvm.vector.extract.nxv4i1.nxv16i1( undef, i64 0) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call @llvm.vector.insert.nxv2f32.v2f32( undef, <2 x float> undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call @llvm.vector.insert.nxv4f32.nxv2f32( undef, undef, i64 0) @@ -101,7 +101,7 @@ define void @vector_insert_extract_idxzero_256b() #2 { ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call @llvm.vector.insert.nxv16i1.nxv2i1( undef, undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call @llvm.vector.extract.nxv4i1.nxv16i1( undef, i64 0) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call @llvm.vector.insert.nxv2f32.v2f32( undef, <2 x float> undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> 
@llvm.vector.extract.v4f16.nxv4f16( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call @llvm.vector.insert.nxv4f32.nxv2f32( undef, undef, i64 0) @@ -113,7 +113,7 @@ define void @vector_insert_extract_idxzero_256b() #2 { ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call @llvm.vector.insert.nxv16i1.nxv2i1( undef, undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call @llvm.vector.extract.nxv4i1.nxv16i1( undef, i64 0) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call @llvm.vector.insert.nxv2f32.v2f32( undef, <2 x float> undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call @llvm.vector.insert.nxv4f32.nxv2f32( undef, undef, i64 0) @@ -930,8 +930,8 @@ define void @get_lane_mask() #0 { ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:33 Lat:33 SizeLat:33 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:5 SizeLat:5 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-VSCALE-2-LABEL: 'get_lane_mask' @@ -953,8 +953,8 @@ define void @get_lane_mask() #0 { ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:33 Lat:33 SizeLat:33 for: %mask_v32i1_i64 = call <32 x i1> 
@llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:5 SizeLat:5 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; TYPE_BASED_ONLY-LABEL: 'get_lane_mask' @@ -976,8 +976,8 @@ define void @get_lane_mask() #0 { ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 16 for: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 8 for: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:48 CodeSize:33 Lat:33 SizeLat:33 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:5 SizeLat:5 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 48 for: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 6 for: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %mask_nxv16i1_i64 = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef) @@ -1364,34 +1364,34 @@ define void @match() #3 { ; CHECK-VSCALE-1-LABEL: 'match' ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call @llvm.experimental.vector.match.nxv16i8.v16i8( undef, <16 x i8> undef, undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call @llvm.experimental.vector.match.nxv8i16.v8i16( undef, <8 x i16> undef, undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> 
@llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-VSCALE-2-LABEL: 'match' ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call @llvm.experimental.vector.match.nxv16i8.v16i8( undef, <16 x i8> undef, undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call @llvm.experimental.vector.match.nxv8i16.v8i16( undef, <8 x i16> undef, undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) ; CHECK-VSCALE-2-NEXT: Cost 
Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; TYPE_BASED_ONLY-LABEL: 'match' ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call @llvm.experimental.vector.match.nxv16i8.v16i8( undef, <16 x i8> undef, undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call @llvm.experimental.vector.match.nxv8i16.v8i16( undef, <8 x i16> undef, undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll index a317d6da4028d..29f7fc938e62e 100644 --- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll +++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll @@ -83,7 +83,7 @@ define <2 x i64> @v2i64_select_sle(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { define <3 x i64> @v3i64_select_sle(<3 x i64> %a, <3 x i64> %b, <3 x i64> %c) { ; COST-LABEL: 'v3i64_select_sle' -; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = icmp sle <3 x i64> %a, %b +; COST-NEXT: Cost Model: Found costs of 2 for: %cmp.1 = icmp sle <3 x i64> %a, %b ; COST-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <3 x i1> %cmp.1, <3 x i64> %a, <3 x i64> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <3 x i64> %s.1 ; @@ -94,7 +94,7 @@ define <3 x i64> 
@v3i64_select_sle(<3 x i64> %a, <3 x i64> %b, <3 x i64> %c) { define <2 x i64> @v2i64_select_no_cmp(<2 x i64> %a, <2 x i64> %b, <2 x i1> %cond) { ; COST-LABEL: 'v2i64_select_no_cmp' -; COST-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b +; COST-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:6 SizeLat:6 for: %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %s.1 ; %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b @@ -119,7 +119,7 @@ define <4 x half> @v4f16_select_ogt(<4 x half> %a, <4 x half> %b, <4 x half> %c) define <8 x half> @v8f16_select_ogt(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-NOFP16-LABEL: 'v8f16_select_ogt' -; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp ogt <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp ogt <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; @@ -169,7 +169,7 @@ define <2 x double> @v2f64_select_ogt(<2 x double> %a, <2 x double> %b, <2 x dou define <4 x bfloat> @v4bf16_select_ogt(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { ; COST-LABEL: 'v4bf16_select_ogt' ; COST-NEXT: Cost Model: Found costs of 4 for: %cmp.1 = fcmp ogt <4 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1 ; %cmp.1 = fcmp ogt <4 x bfloat> %a, %b @@ -179,8 +179,8 @@ define <4 x bfloat> @v4bf16_select_ogt(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf define <8 x bfloat> @v8bf16_select_ogt(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { ; COST-LABEL: 'v8bf16_select_ogt' -; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp ogt <8 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp ogt <8 x bfloat> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1 ; %cmp.1 = fcmp ogt <8 x bfloat> %a, %b @@ -206,7 +206,7 @@ define <4 x half> @v4f16_select_oge(<4 x half> %a, <4 x half> %b, <4 x half> %c) define <8 x half> @v8f16_select_oge(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-NOFP16-LABEL: 'v8f16_select_oge' -; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp oge <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp oge <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; 
COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; @@ -256,7 +256,7 @@ define <2 x double> @v2f64_select_oge(<2 x double> %a, <2 x double> %b, <2 x dou define <4 x bfloat> @v4bf16_select_oge(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { ; COST-LABEL: 'v4bf16_select_oge' ; COST-NEXT: Cost Model: Found costs of 4 for: %cmp.1 = fcmp oge <4 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1 ; %cmp.1 = fcmp oge <4 x bfloat> %a, %b @@ -266,8 +266,8 @@ define <4 x bfloat> @v4bf16_select_oge(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf define <8 x bfloat> @v8bf16_select_oge(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { ; COST-LABEL: 'v8bf16_select_oge' -; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp oge <8 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp oge <8 x bfloat> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1 ; %cmp.1 = fcmp oge <8 x bfloat> %a, %b @@ -293,7 +293,7 @@ define <4 x half> @v4f16_select_olt(<4 x half> %a, <4 x half> %b, <4 x half> %c) define <8 x half> @v8f16_select_olt(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-NOFP16-LABEL: 'v8f16_select_olt' -; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp olt <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp olt <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; @@ -343,7 +343,7 @@ define <2 x double> @v2f64_select_olt(<2 x double> %a, <2 x double> %b, <2 x dou define <4 x bfloat> @v4bf16_select_olt(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { ; COST-LABEL: 'v4bf16_select_olt' ; COST-NEXT: Cost Model: Found costs of 4 for: %cmp.1 = fcmp olt <4 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1 ; %cmp.1 = fcmp olt <4 x bfloat> %a, %b @@ -353,8 +353,8 @@ define <4 x bfloat> @v4bf16_select_olt(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf define <8 x bfloat> @v8bf16_select_olt(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { ; COST-LABEL: 'v8bf16_select_olt' -; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp olt <8 x bfloat> %a, %b -; 
COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp olt <8 x bfloat> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1 ; %cmp.1 = fcmp olt <8 x bfloat> %a, %b @@ -380,7 +380,7 @@ define <4 x half> @v4f16_select_ole(<4 x half> %a, <4 x half> %b, <4 x half> %c) define <8 x half> @v8f16_select_ole(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-NOFP16-LABEL: 'v8f16_select_ole' -; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp ole <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp ole <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; @@ -430,7 +430,7 @@ define <2 x double> @v2f64_select_ole(<2 x double> %a, <2 x double> %b, <2 x dou define <4 x bfloat> @v4bf16_select_ole(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { ; COST-LABEL: 'v4bf16_select_ole' ; COST-NEXT: Cost Model: Found costs of 4 for: %cmp.1 = fcmp ole <4 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1 ; %cmp.1 = fcmp ole <4 x bfloat> %a, %b @@ -440,8 +440,8 @@ define <4 x bfloat> @v4bf16_select_ole(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf define <8 x bfloat> @v8bf16_select_ole(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { ; COST-LABEL: 'v8bf16_select_ole' -; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp ole <8 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp ole <8 x bfloat> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1 ; %cmp.1 = fcmp ole <8 x bfloat> %a, %b @@ -467,7 +467,7 @@ define <4 x half> @v4f16_select_oeq(<4 x half> %a, <4 x half> %b, <4 x half> %c) define <8 x half> @v8f16_select_oeq(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-NOFP16-LABEL: 'v8f16_select_oeq' -; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp oeq <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp oeq <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 
Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; @@ -517,7 +517,7 @@ define <2 x double> @v2f64_select_oeq(<2 x double> %a, <2 x double> %b, <2 x dou define <4 x bfloat> @v4bf16_select_oeq(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { ; COST-LABEL: 'v4bf16_select_oeq' ; COST-NEXT: Cost Model: Found costs of 4 for: %cmp.1 = fcmp oeq <4 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1 ; %cmp.1 = fcmp oeq <4 x bfloat> %a, %b @@ -527,8 +527,8 @@ define <4 x bfloat> @v4bf16_select_oeq(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf define <8 x bfloat> @v8bf16_select_oeq(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { ; COST-LABEL: 'v8bf16_select_oeq' -; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp oeq <8 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp oeq <8 x bfloat> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1 ; %cmp.1 = fcmp oeq <8 x bfloat> %a, %b @@ -554,7 +554,7 @@ define <4 x half> @v4f16_select_one(<4 x half> %a, <4 x half> %b, <4 x half> %c) define <8 x half> @v8f16_select_one(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-NOFP16-LABEL: 'v8f16_select_one' -; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp one <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp one <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; @@ -604,7 +604,7 @@ define <2 x double> @v2f64_select_one(<2 x double> %a, <2 x double> %b, <2 x dou define <4 x bfloat> @v4bf16_select_one(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { ; COST-LABEL: 'v4bf16_select_one' ; COST-NEXT: Cost Model: Found costs of 4 for: %cmp.1 = fcmp one <4 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1 ; %cmp.1 = fcmp one <4 x bfloat> %a, %b @@ -614,8 +614,8 @@ define <4 x bfloat> @v4bf16_select_one(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf define <8 x bfloat> @v8bf16_select_one(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { ; COST-LABEL: 'v8bf16_select_one' -; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp one <8 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 
SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp one <8 x bfloat> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1 ; %cmp.1 = fcmp one <8 x bfloat> %a, %b @@ -641,7 +641,7 @@ define <4 x half> @v4f16_select_une(<4 x half> %a, <4 x half> %b, <4 x half> %c) define <8 x half> @v8f16_select_une(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-NOFP16-LABEL: 'v8f16_select_une' -; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp une <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp une <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; @@ -691,7 +691,7 @@ define <2 x double> @v2f64_select_une(<2 x double> %a, <2 x double> %b, <2 x dou define <4 x bfloat> @v4bf16_select_une(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { ; COST-LABEL: 'v4bf16_select_une' ; COST-NEXT: Cost Model: Found costs of 4 for: %cmp.1 = fcmp une <4 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1 ; %cmp.1 = fcmp une <4 x bfloat> %a, %b @@ -701,8 +701,8 @@ define <4 x bfloat> @v4bf16_select_une(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf define <8 x bfloat> @v8bf16_select_une(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { ; COST-LABEL: 'v8bf16_select_une' -; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp une <8 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp une <8 x bfloat> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1 ; %cmp.1 = fcmp une <8 x bfloat> %a, %b @@ -728,7 +728,7 @@ define <4 x half> @v4f16_select_ord(<4 x half> %a, <4 x half> %b, <4 x half> %c) define <8 x half> @v8f16_select_ord(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-NOFP16-LABEL: 'v8f16_select_ord' -; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp ord <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp ord <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found costs of 2 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; @@ -778,7 +778,7 @@ 
define <2 x double> @v2f64_select_ord(<2 x double> %a, <2 x double> %b, <2 x dou define <4 x bfloat> @v4bf16_select_ord(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) { ; COST-LABEL: 'v4bf16_select_ord' ; COST-NEXT: Cost Model: Found costs of 4 for: %cmp.1 = fcmp ord <4 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %s.1 = select <4 x i1> %cmp.1, <4 x bfloat> %a, <4 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %s.1 ; %cmp.1 = fcmp ord <4 x bfloat> %a, %b @@ -788,8 +788,8 @@ define <4 x bfloat> @v4bf16_select_ord(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bf define <8 x bfloat> @v8bf16_select_ord(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { ; COST-LABEL: 'v8bf16_select_ord' -; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %cmp.1 = fcmp ord <8 x bfloat> %a, %b -; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c +; COST-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %cmp.1 = fcmp ord <8 x bfloat> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:22 CodeSize:15 Lat:22 SizeLat:22 for: %s.1 = select <8 x i1> %cmp.1, <8 x bfloat> %a, <8 x bfloat> %c ; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %s.1 ; %cmp.1 = fcmp ord <8 x bfloat> %a, %b diff --git a/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll b/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll index 781eba144205d..00ce9680cab32 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll @@ -24,8 +24,8 @@ define i32 @reduce_i1(i32 %arg) { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 257 for instruction: %V256 = call i1 @llvm.vector.reduce.and.v256i1(<256 x i1> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V256 = call i1 @llvm.vector.reduce.and.v256i1(<256 x i1> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll b/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll index 9e03620e2aa6f..eccaffc9d2a40 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll @@ -24,8 +24,8 @@ define i32 @reduce_i1(i32 %arg) { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) ; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 257 for instruction: %V256 = call i1 @llvm.vector.reduce.or.v256i1(<256 x i1> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V256 = call i1 @llvm.vector.reduce.or.v256i1(<256 x i1> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll index 484cd201d14a0..9c2d6bc20d302 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll @@ -108,27 +108,27 @@ define i32 @sadd(i32 %arg) { ; NEON-SIZE-LABEL: 'sadd' ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } 
@llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'sadd' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 289 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-SIZE-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -266,27 +266,27 @@ define i32 @uadd(i32 %arg) { ; NEON-SIZE-LABEL: 'uadd' ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } 
@llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'uadd'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@@ -424,27 +424,27 @@ define i32 @ssub(i32 %arg) {
 ; NEON-SIZE-LABEL: 'ssub'
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'ssub'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 289 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@@ -582,27 +582,27 @@ define i32 @usub(i32 %arg) {
 ; NEON-SIZE-LABEL: 'usub'
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'usub'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@@ -720,47 +720,47 @@ define i32 @smul(i32 %arg) {
 ;
 ; V8M-SIZE-LABEL: 'smul'
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 322 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'smul'
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'smul'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 109 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 213 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@@ -878,47 +878,47 @@ define i32 @umul(i32 %arg) {
 ;
 ; V8M-SIZE-LABEL: 'umul'
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'umul'
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'umul'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 149 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
index da7096af4d80c..34397bd8bfb4b 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
@@ -112,31 +112,31 @@ define i32 @add(i32 %arg) {
 ;
 ; V8M-SIZE-LABEL: 'add'
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'add'
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -162,22 +162,22 @@ define i32 @add(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'add'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 185 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 369 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -318,31 +318,31 @@ define i32 @sub(i32 %arg) {
 ;
 ; V8M-SIZE-LABEL: 'sub'
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'sub'
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -368,22 +368,22 @@ define i32 @sub(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'sub'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 185 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 369 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
index 8a12ca2df234e..28e4860c8810c 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
@@ -112,31 +112,31 @@ define i32 @add(i32 %arg) {
 ;
 ; V8M-SIZE-LABEL: 'add'
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'add'
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
@@ -162,22 +162,22 @@ define i32 @add(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'add'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I8 =
call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -318,31 +318,31 @@ define i32 @sub(i32 %arg) { ; ; V8M-SIZE-LABEL: 'sub' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for 
instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'sub' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 
@llvm.usub.sat.i64(i64 undef, i64 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -368,22 +368,22 @@ define i32 @sub(i32 %arg) { ; ; MVE-SIZE-LABEL: 'sub' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> 
@llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/cmps.ll b/llvm/test/Analysis/CostModel/ARM/cmps.ll index e86a1b95a7b36..090fccee81bbe 100644 --- a/llvm/test/Analysis/CostModel/ARM/cmps.ll +++ b/llvm/test/Analysis/CostModel/ARM/cmps.ll @@ -96,7 +96,7 @@ define i32 @cmps() { ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a12 = fcmp oge <2 x double> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p = icmp eq ptr undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q = icmp eq <4 x ptr> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef @@ -106,17 +106,17 @@ define i32 @cmps() { ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = icmp slt <16 x i8> undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = icmp ult <8 x i16> undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = icmp sge <4 x i32> undef, undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %e = icmp slt <16 x i8> undef, undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f = icmp ult <8 x i16> undef, undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g = icmp sge <4 x i32> undef, undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, 
undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a10 = fcmp olt <8 x half> undef, undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a11 = fcmp oge <4 x float> undef, undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a12 = fcmp oge <2 x double> undef, undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p = icmp eq ptr undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q = icmp eq <4 x ptr> undef, undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %q = icmp eq <4 x ptr> undef, undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-V8M-BASE-SIZE-LABEL: 'cmps' @@ -124,17 +124,17 @@ define i32 @cmps() { ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = icmp ult i16 undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = icmp sge i32 undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = icmp ne i64 undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = icmp slt <16 x i8> undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = icmp ult <8 x i16> undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = icmp sge <4 x i32> undef, undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %e = icmp slt <16 x i8> undef, undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %f = icmp ult <8 x i16> undef, undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g = icmp sge <4 x i32> undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a10 = fcmp olt <8 x half> undef, undef +; 
CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a11 = fcmp oge <4 x float> undef, undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a12 = fcmp oge <2 x double> undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p = icmp eq ptr undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q = icmp eq <4 x ptr> undef, undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %q = icmp eq <4 x ptr> undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-V8R-SIZE-LABEL: 'cmps' @@ -148,9 +148,9 @@ define i32 @cmps() { ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a7 = fcmp oge half undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a8 = fcmp ogt float undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef -; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10 = fcmp olt <8 x half> undef, undef +; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %a10 = fcmp olt <8 x half> undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef -; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef +; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a12 = fcmp oge <2 x double> undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p = icmp eq ptr undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %q = icmp eq <4 x ptr> undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef @@ -257,11 +257,11 @@ define void @minmax() { ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c3 = icmp slt i32 undef, undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s3 = select i1 %c3, i32 undef, i32 undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %c4 = icmp slt <4 x i32> undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt ptr undef, undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s5 = select i1 %c5, ptr undef, ptr undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c6 = icmp slt <4 x ptr> undef, undef -; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s6 = select <4 x i1> %c6, <4 x ptr> undef, <4 x ptr> undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c6 = icmp slt <4 x ptr> undef, undef +; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %s6 = select <4 x i1> %c6, <4 x ptr> 
undef, <4 x ptr> undef ; CHECK-V8M-MAIN-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; CHECK-V8M-BASE-SIZE-LABEL: 'minmax' @@ -272,11 +272,11 @@ define void @minmax() { ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c3 = icmp slt i32 undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s3 = select i1 %c3, i32 undef, i32 undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %c4 = icmp slt <4 x i32> undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt ptr undef, undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s5 = select i1 %c5, ptr undef, ptr undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c6 = icmp slt <4 x ptr> undef, undef -; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s6 = select <4 x i1> %c6, <4 x ptr> undef, <4 x ptr> undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c6 = icmp slt <4 x ptr> undef, undef +; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %s6 = select <4 x i1> %c6, <4 x ptr> undef, <4 x ptr> undef ; CHECK-V8M-BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; CHECK-V8R-SIZE-LABEL: 'minmax' diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll index 66240c8255ad7..7a40252dfa619 100644 --- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll @@ -236,17 +236,17 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x ; ; LATE-LABEL: 'fshl' ; LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; LATE-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) +; LATE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fshl' ; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; SIZE-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) +; SIZE-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'fshl' ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v = call <16 x i32> 
@llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) diff --git a/llvm/test/Analysis/CostModel/ARM/mve-abs.ll b/llvm/test/Analysis/CostModel/ARM/mve-abs.ll index 12a1ae7a43610..cc8f2da57f072 100644 --- a/llvm/test/Analysis/CostModel/ARM/mve-abs.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-abs.ll @@ -58,22 +58,22 @@ define i32 @abs(i32 %arg) { ; ; MVE-SIZE-LABEL: 'abs' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) diff --git a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll index 7f1005285747c..01341e4dcb648 100644 --- a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll @@ -60,22 +60,22 @@ define i32 @smin(i32 %arg) { ; ; MVE-SIZE-LABEL: 'smin' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 
= call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -167,22 +167,22 @@ define i32 @smax(i32 %arg) { ; ; MVE-SIZE-LABEL: 'smax' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) -; MVE-SIZE-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -275,22 +275,22 @@ define i32 @umin(i32 %arg) { ; ; MVE-SIZE-LABEL: 'umin' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> 
undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) @@ -382,22 +382,22 @@ define i32 @sub(i32 %arg) { ; ; MVE-SIZE-LABEL: 'sub' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I32 = 
call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/select.ll b/llvm/test/Analysis/CostModel/ARM/select.ll index 4e2ccffa7cc5b..f626901990f01 100644 --- a/llvm/test/Analysis/CostModel/ARM/select.ll +++ b/llvm/test/Analysis/CostModel/ARM/select.ll @@ -155,21 +155,21 @@ define void @selects() { ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x 
i32> undef, <2 x i32> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; CHECK-NEON-SIZE-LABEL: 'selects' @@ -212,28 +212,28 @@ define void @selects() { ; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef ; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef ; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v6 = select i1 undef, double undef, double undef -; 
CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef -; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8 = select <4 x i1> undef, <4 x i8> 
undef, <4 x i8> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef +; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef ; CHECK-THUMB1-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; CHECK-THUMB2-SIZE-LABEL: 'selects' @@ -244,28 +244,28 @@ define void @selects() { ; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = select i1 undef, i64 undef, i64 undef ; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = select i1 undef, float undef, float undef ; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v6 = select i1 undef, double undef, double undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef -; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated 
cost of 12 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef +; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef ; CHECK-THUMB2-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v0 = select i1 undef, i1 undef, i1 undef diff --git a/llvm/test/Analysis/DependenceAnalysis/DifferentOffsets.ll b/llvm/test/Analysis/DependenceAnalysis/DifferentOffsets.ll index 4f95da4f79c57..d9ccea55dd478 100644 --- a/llvm/test/Analysis/DependenceAnalysis/DifferentOffsets.ll +++ b/llvm/test/Analysis/DependenceAnalysis/DifferentOffsets.ll @@ -11,7 +11,7 
@@ define i32 @alias_with_different_offsets(ptr nocapture %A) { ; CHECK-LABEL: 'alias_with_different_offsets' ; CHECK-NEXT: Src: store i32 2, ptr %arrayidx, align 1 --> Dst: store i32 2, ptr %arrayidx, align 1 -; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: store i32 2, ptr %arrayidx, align 1 --> Dst: %0 = load i32, ptr %A, align 1 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %0 = load i32, ptr %A, align 1 --> Dst: %0 = load i32, ptr %A, align 1 @@ -207,11 +207,11 @@ end: ; *((long long *)idx) = 1; ; } ; -; FIXME: There are loop-carried dependencies across iterations in the store. +; There are loop-carried dependencies across iterations in the store. define void @multidim_accesses2(ptr %A) { ; CHECK-LABEL: 'multidim_accesses2' ; CHECK-NEXT: Src: store i64 1, ptr %idx, align 4 --> Dst: store i64 1, ptr %idx, align 4 -; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: da analyze - confused! ; entry: br label %for.i diff --git a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll index c1f8c85f2bf0e..b498d70648bad 100644 --- a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll +++ b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll @@ -40,6 +40,9 @@ define void @test(ptr %A, ptr %B, i1 %arg, i32 %n, i32 %m) #0 align 2 { ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %v27 = load <32 x i32>, ptr %v25, align 256 --> Dst: %v27 = load <32 x i32>, ptr %v25, align 256 ; CHECK-NEXT: da analyze - consistent input [0 S S]! +; CHECK-NEXT: Runtime Assumptions: +; CHECK-NEXT: Equal predicate: (zext i7 (4 * (trunc i32 %v1 to i7) * (1 + (trunc i32 %n to i7))) to i32) == 0 +; CHECK-NEXT: Equal predicate: (8 * (zext i4 (trunc i32 %v1 to i4) to i32)) == 0 ; CHECK-NEXT: Src: %v27 = load <32 x i32>, ptr %v25, align 256 --> Dst: %v32 = load <32 x i32>, ptr %v30, align 128 ; CHECK-NEXT: da analyze - input [* S S|<]! 
; CHECK-NEXT: Runtime Assumptions: diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll index 247a105940e6e..fe140d01e8818 100644 --- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll +++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll @@ -909,10 +909,10 @@ exit: ; preds = %loop ret i16 %crc.next } -define i16 @not.crc.bad.cast(i8 %msg, i16 %checksum) { -; CHECK-LABEL: 'not.crc.bad.cast' +define i16 @not.crc.bad.endian.swapped.sb.check(i8 %msg, i16 %checksum) { +; CHECK-LABEL: 'not.crc.bad.endian.swapped.sb.check' ; CHECK-NEXT: Did not find a hash algorithm -; CHECK-NEXT: Reason: Expected bottom 8 bits zero (????????00001011) +; CHECK-NEXT: Reason: Found stray unvisited instructions ; entry: br label %loop @@ -1189,3 +1189,55 @@ loop: ; preds = %loop, %entry exit: ; preds = %loop ret i16 %crc.next } + +define i16 @not.crc.stray.unvisited.call(i16 %crc.init) { +; CHECK-LABEL: 'not.crc.stray.unvisited.call' +; CHECK-NEXT: Did not find a hash algorithm +; CHECK-NEXT: Reason: Found stray unvisited instructions +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ] + %crc.shl = shl i16 %crc, 1 + %crc.xor = xor i16 %crc.shl, 4129 + %check.sb = icmp slt i16 %crc, 0 + %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl + call void @print(i16 %crc.next) + %iv.next = add nuw nsw i32 %iv, 1 + %exit.cond = icmp samesign ult i32 %iv, 7 + br i1 %exit.cond, label %loop, label %exit + +exit: ; preds = %loop + ret i16 %crc.next +} + +declare void @print(i16) + +define i16 @not.crc.call.sb.check(i16 %crc.init) { +; CHECK-LABEL: 'not.crc.call.sb.check' +; CHECK-NEXT: Did not find a hash algorithm +; CHECK-NEXT: Reason: Found stray unvisited instructions +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ] + %crc.shl = shl i16 %crc, 1 + %crc.xor = xor i16 %crc.shl, 4129 + %call = call i16 @side.effect() + %check.sb = icmp slt i16 %call, 0 + %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl + %iv.next = add nuw nsw i32 %iv, 1 + %exit.cond = icmp samesign ult i32 %iv, 7 + br i1 %exit.cond, label %loop, label %exit + +exit: ; preds = %loop + ret i16 %crc.next +} + +declare i16 @side.effect() diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index bd7464577b7db..705c128a1b34f 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -197,6 +197,209 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> %A, i1 false, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 false, <16 x bfloat> %A, i1 false, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define 
amdgpu_kernel void @wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %A, i1 false, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> %A, i1 false, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> %A, i1 false, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16.v8f32(i1 false, <16 x bfloat> %A, i1 false, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr 
addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> %A, i1 false, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK:
DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 false, <16 x bfloat> %A, i1 false, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 false, <16 x bfloat> %A, i1 false, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 false, <16 x bfloat> %A, i1 false, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; 
CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void 
@swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 false, <8 x i32> %A, i1 false, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) %addr) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: @@ -618,6 +821,35 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1) +declare <8 
x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x float>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x float>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x float>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x float>, i16, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x half>, i16, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x half>, i16, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x half>, i16, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x half>, i16, i1, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i16 %Index, i1, i1) declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 2a6135da9a61e..3426b6ff8d24d 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -97,6 +97,7 @@ set(LLVM_TEST_DEPENDS llvm-exegesis llvm-extract llvm-gsymutil + llvm-ir2vec llvm-isel-fuzzer llvm-ifs llvm-install-name-tool diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir new file mode 100644 index 0000000000000..09eb18b0e3574 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir @@ -0,0 +1,364 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: split_loads_to_fpr128 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0 + ; 
CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSui %0, 0 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %1 + %8:fpr128 = LD1i32 %7, 2, killed %2 + %9:fpr128 = LD1i32 %8, 3, killed %3 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = 
LD1i16 [[LD1_1]], 2, killed [[COPY7]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHroX %0, killed %1, 0, 1 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %2 + %12:fpr128 = LD1i16 %11, 2, killed %3 + %13:fpr128 = LD1i16 %12, 3, killed %4 + %14:fpr128 = LD1i16 %13, 4, killed %5 + %15:fpr128 = LD1i16 %14, 5, killed %6 + %16:fpr128 = LD1i16 %15, 6, killed %7 + %17:fpr128 = LD1i16 %16, 7, killed %8 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHui %0, 0 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %1 + %12:fpr128 = LD1i16 %11, 2, killed %2 + %13:fpr128 = LD1i16 %12, 3, killed %3 + %14:fpr128 = LD1i16 %13, 4, killed %4 + %15:fpr128 = LD1i16 %14, 5, killed %5 + %16:fpr128 = LD1i16 %15, 6, killed %6 + %17:fpr128 = LD1i16 %16, 7, killed %7 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i8 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i8 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common 
= COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16 + ; CHECK-NEXT: [[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]] + ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, killed [[COPY6]] + ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]] + ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]] + ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]] + ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]] + ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]] + ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed [[COPY16]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:gpr64common = COPY $x9 + %10:gpr64common = COPY $x10 + %11:gpr64common = COPY $x11 + %12:gpr64common = COPY $x12 + %13:gpr64common = COPY $x13 + %14:gpr64common = COPY $x14 + %15:gpr64common = COPY $x15 + %16:gpr64common = COPY $x16 + %17:fpr8 = LDRBroX %0, killed %1, 0, 0 + %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub + %19:fpr128 = LD1i8 %18, 1, killed %2 + %20:fpr128 = LD1i8 %19, 2, killed %3 + %21:fpr128 = LD1i8 %20, 3, killed %4 + %22:fpr128 = LD1i8 %21, 4, killed %5 + %23:fpr128 = LD1i8 %22, 5, killed %6 + %24:fpr128 = LD1i8 %23, 6, killed %7 + %25:fpr128 = LD1i8 %24, 7, killed %8 + %26:fpr128 = LD1i8 %25, 8, killed %9 + %27:fpr128 = LD1i8 %26, 9, killed %10 + %28:fpr128 = LD1i8 %27, 10, killed %11 + %29:fpr128 = LD1i8 %28, 11, killed %12 + %30:fpr128 = LD1i8 %29, 12, 
killed %13 + %31:fpr128 = LD1i8 %30, 13, killed %14 + %32:fpr128 = LD1i8 %31, 14, killed %15 + %33:fpr128 = LD1i8 %32, 15, killed %16 + $q0 = COPY %33 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_missing_lanes +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: negative_pattern_missing_lanes + ; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0 + ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]] + + %0:gpr64common = COPY $x0 + %1:fpr128 = LDRQui $x1, 0 + %2:fpr128 = LD1i32 %1, 3, %0 + $q0 = COPY %2 + RET_ReallyLR implicit $q0 + +--- +name: out_of_order_lanes +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: out_of_order_lanes + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 2, killed %2 + %8:fpr128 = LD1i32 %7, 1, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_no_subreg_to_reg +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3 + + ; CHECK-LABEL: name: negative_pattern_no_subreg_to_reg + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[INITIAL_VEC:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[INITIAL_VEC]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:fpr128 = LDRQui %0, 0 + %5:fpr128 = LD1i32 %4, 1, killed %1 + %6:fpr128 = LD1i32 %5, 2, killed %2 + %7:fpr128 = LD1i32 %6, 3, killed %3 + $q0 = COPY %7 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_multiple_users +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: negative_pattern_multiple_users + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY 
$x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: $q1 = COPY [[LD_LANE_2]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + $q1 = COPY %8 + RET_ReallyLR implicit $q0, implicit $q1 diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll index ff28c7817d143..bae254bbd2104 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -172,8 +172,9 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) { ; CHECK-LABEL: test_insert_v8f16_insert_1: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: dup.8h v0, v0[0] -; CHECK-NEXT: mov.h v0[7], wzr +; CHECK-NEXT: mov.h v0[7], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> , half %a, i32 0 %v.1 = insertelement <8 x half> %v.0, half %a, i32 1 @@ -278,8 +279,9 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) { ; CHECK-LABEL: test_insert_3_f32_undef_zero_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: dup.4s v0, v0[0] -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> , float %a, i32 0 %v.1 = insertelement <4 x float> %v.0, float %a, i32 1 @@ -347,12 +349,12 @@ define <8 x i16> @test_insert_v8i16_i16_zero(<8 x i16> %a) { ret <8 x i16> %v.0 } -; TODO: This should jsut be a mov.s v0[3], wzr define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) { ; CHECK-LABEL: test_insert_v4f16_f16_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov.h v0[0], wzr +; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0 @@ -362,7 +364,8 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) { define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) { ; CHECK-LABEL: test_insert_v8f16_f16_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.h v0[6], wzr +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov.h v0[6], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6 ret <8 x half> %v.0 @@ -371,8 +374,9 @@ define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) { define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) { ; CHECK-LABEL: test_insert_v2f32_f32_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov.s v0[0], wzr +; CHECK-NEXT: mov.s v0[0], 
v1[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0 @@ -382,7 +386,8 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) { define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { ; CHECK-LABEL: test_insert_v4f32_f32_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3 ret <4 x float> %v.0 @@ -391,8 +396,60 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) { ; CHECK-LABEL: test_insert_v2f64_f64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov.d v0[1], v1[0] +; CHECK-NEXT: ret + %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1 + ret <2 x double> %v.0 +} + +define <4 x half> @test_insert_v4f16_f16_zero_wzr(<4 x half> %a) #1 { +; CHECK-LABEL: test_insert_v4f16_f16_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov.h v0[0], wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0 + ret <4 x half> %v.0 +} + +define <8 x half> @test_insert_v8f16_f16_zero_wzr(<8 x half> %a) #1 { +; CHECK-LABEL: test_insert_v8f16_f16_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.h v0[6], wzr +; CHECK-NEXT: ret + %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6 + ret <8 x half> %v.0 +} + +define <2 x float> @test_insert_v2f32_f32_zero_wzr(<2 x float> %a) #1 { +; CHECK-LABEL: test_insert_v2f32_f32_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov.s v0[0], wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0 + ret <2 x float> %v.0 +} + +define <4 x float> @test_insert_v4f32_f32_zero_wzr(<4 x float> %a) #1 { +; CHECK-LABEL: test_insert_v4f32_f32_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: ret + %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3 + ret <4 x float> %v.0 +} + +define <2 x double> @test_insert_v2f64_f64_zero_xzr(<2 x double> %a) #1 { +; CHECK-LABEL: test_insert_v2f64_f64_zero_xzr: +; CHECK: // %bb.0: ; CHECK-NEXT: mov.d v0[1], xzr ; CHECK-NEXT: ret %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1 ret <2 x double> %v.0 } + +attributes #1 = {"tune-cpu"="cortex-a55"} diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll new file mode 100644 index 0000000000000..9986af7eb231c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll @@ -0,0 +1,32 @@ +; This test checks that two similar functions, @0 and @1, are not merged, as they are unnamed.
+ +; RUN: opt -mtriple=arm64-apple-darwin -S --passes=global-merge-func %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true < %s | FileCheck %s + +; CHECK-NOT: .Tgm + +@g = external local_unnamed_addr global [0 x i32], align 4 +@g1 = external global i32, align 4 +@g2 = external global i32, align 4 + +define i32 @0(i32 %a) { +entry: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %1 = load volatile i32, i32* @g1, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, 1 + ret i32 %add +} + +define i32 @1(i32 %a) { +entry: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %1 = load volatile i32, i32* @g2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, 1 + ret i32 %add +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 7686740aec302..13434fabefa78 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -203,89 +203,93 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: ldr s17, [sp, #40] -; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: ldr s17, [sp, #32] +; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 ; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: add x10, sp, #64 ; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: ldr s3, [sp, #32] -; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: ld1 { v17.s }[1], [x10] -; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: ldr s16, [sp, #8] ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ld1 { v3.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: add x11, sp, #72 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ldr s18, [x10] +; CHECK-NEXT: add x9, sp, #80 +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: ldr s16, [sp, #8] +; CHECK-NEXT: ldr s3, [sp, #96] +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: ldr s2, [sp] -; CHECK-NEXT: ld1 { v16.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: ldr s20, [sp, #136] ; CHECK-NEXT: mov v1.s[2], v5.s[0] -; CHECK-NEXT: ld1 { v17.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ldr s5, [sp, #96] -; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: ldr s5, [sp, #40] ; CHECK-NEXT: mov v0.s[2], v4.s[0] -; CHECK-NEXT: add x9, sp, #88 -; CHECK-NEXT: ldr s4, [sp, #104] -; CHECK-NEXT: ldr s19, [sp, #192] ; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #80 -; CHECK-NEXT: ld1 { v17.s }[3], [x9] -; CHECK-NEXT: mov v1.s[3], v7.s[0] -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v3.s }[3], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: ldr s7, [sp, #128] +; CHECK-NEXT: ldr s19, [x11] ; CHECK-NEXT: add x10, sp, #144 +; CHECK-NEXT: zip1 
v4.2d, v17.2d, v18.2d +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: ldr s18, [sp, #136] +; CHECK-NEXT: ld1 { v19.s }[1], [x9] ; CHECK-NEXT: mov v0.s[3], v6.s[0] -; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ldr s6, [sp, #128] +; CHECK-NEXT: mov v1.s[3], v7.s[0] +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ldr s7, [sp, #104] +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v6.s }[1], [x10] +; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d +; CHECK-NEXT: add x10, sp, #120 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: fmul v6.4s, v17.4s, v1.4s -; CHECK-NEXT: fmul v18.4s, v4.4s, v16.4s -; CHECK-NEXT: fmul v16.4s, v5.4s, v16.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add x10, sp, #208 -; CHECK-NEXT: ld1 { v7.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v19.s }[1], [x10] -; CHECK-NEXT: ld1 { v20.s }[1], [x9] +; CHECK-NEXT: ldr s17, [x11] ; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: fneg v6.4s, v6.4s -; CHECK-NEXT: fneg v18.4s, v18.4s -; CHECK-NEXT: fmla v16.4s, v2.4s, v4.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v17.4s -; CHECK-NEXT: ld1 { v7.s }[3], [x9] -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: ld1 { v20.s }[2], [x9] -; CHECK-NEXT: ldr s4, [sp, #200] +; CHECK-NEXT: add x10, sp, #16 +; CHECK-NEXT: add x11, sp, #168 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v2.s }[1], [x10] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s +; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s +; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: fmul v1.4s, v4.4s, v1.4s +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: ldr s21, [x11] +; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d +; CHECK-NEXT: ldr s17, [sp, #192] +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #208 +; CHECK-NEXT: ld1 { v21.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #216 -; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v18.4s, v2.4s, v5.4s -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s -; CHECK-NEXT: fsub v1.4s, v19.4s, v16.4s -; CHECK-NEXT: ld1 { v20.s }[3], [x10] -; CHECK-NEXT: fadd v2.4s, v4.4s, v18.4s -; CHECK-NEXT: fadd v3.4s, v20.4s, v6.4s +; CHECK-NEXT: fneg v19.4s, v19.4s +; CHECK-NEXT: fneg v20.4s, v20.4s +; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s +; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: ldr s5, [sp, #200] +; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fmla v19.4s, v0.4s, v4.4s +; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s +; CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s +; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s +; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s +; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #12 -; CHECK-NEXT: trn2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v5.16b, #8 +; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 ; CHECK-NEXT: rev64 v4.4s, v4.4s -; CHECK-NEXT: trn2 v2.4s, v4.4s, v5.4s -; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8 -; CHECK-NEXT: mov v4.d[1], v2.d[0] +; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s +; CHECK-NEXT: zip2 v4.4s, v0.4s, v2.4s +; 
CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v1.16b, v3.16b, v1.16b, #8 +; CHECK-NEXT: mov v4.d[1], v3.d[0] ; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: stp q4, q1, [x8, #16] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index acf15f1bd1178..e6f27b95d92c8 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -186,8 +186,9 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ld1 { v0.s }[1], [x1] -; CHECK-NEXT: ld1 { v0.s }[2], [x2] -; CHECK-NEXT: ld1 { v0.s }[3], [x3] +; CHECK-NEXT: ldr s1, [x2] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index c6b8e41f9bdfd..4906e2e15e51c 100644 --- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -1431,6 +1431,7 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -1439,30 +1440,30 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret @@ -2012,6 +2013,7 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed 
$h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -2020,30 +2022,30 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 4c28c90824028..ae2ef2649102e 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -2509,87 +2509,88 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; ; CHECK-GI-LABEL: fshl_v7i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr s3, [sp, #48] -; CHECK-GI-NEXT: ldr s20, [sp, #56] -; CHECK-GI-NEXT: add x9, sp, #56 +; CHECK-GI-NEXT: ldr s17, [sp, #48] +; CHECK-GI-NEXT: add x8, sp, #56 +; CHECK-GI-NEXT: add x9, sp, #64 ; CHECK-GI-NEXT: ldr s4, [sp, #48] -; CHECK-GI-NEXT: ldr s7, [sp, #80] -; CHECK-GI-NEXT: mov w12, #-1 // =0xffffffff -; CHECK-GI-NEXT: ldr s21, [sp, #88] -; CHECK-GI-NEXT: mov v3.s[1], v20.s[0] -; CHECK-GI-NEXT: fmov s20, w12 -; CHECK-GI-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-GI-NEXT: ldr s17, [sp] -; CHECK-GI-NEXT: add x13, sp, #64 -; CHECK-GI-NEXT: mov v7.s[1], v21.s[0] +; CHECK-GI-NEXT: ldr s21, [sp, #56] +; CHECK-GI-NEXT: mov w10, #-1 // =0xffffffff +; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-GI-NEXT: ldr s20, [x9] +; CHECK-GI-NEXT: add x8, sp, #72 +; CHECK-GI-NEXT: mov v4.s[1], v21.s[0] ; CHECK-GI-NEXT: fmov s21, w7 +; CHECK-GI-NEXT: ldr s6, [sp] +; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8] ; CHECK-GI-NEXT: ldr s19, [sp, #64] -; CHECK-GI-NEXT: mov w11, #31 // =0x1f -; CHECK-GI-NEXT: mov v20.s[1], w12 +; CHECK-GI-NEXT: ldr s7, [sp, #80] +; CHECK-GI-NEXT: ldr s22, [sp, #88] +; CHECK-GI-NEXT: mov w9, #31 // =0x1f +; CHECK-GI-NEXT: mov w11, #1 // =0x1 +; CHECK-GI-NEXT: mov v21.s[1], v6.s[0] +; CHECK-GI-NEXT: fmov s6, w9 ; CHECK-GI-NEXT: ldr s18, [sp, #96] -; CHECK-GI-NEXT: ld1 { v4.s }[2], [x13] -; CHECK-GI-NEXT: mov w13, #1 // =0x1 -; CHECK-GI-NEXT: mov v3.s[2], 
v19.s[0] -; CHECK-GI-NEXT: mov v21.s[1], v17.s[0] -; CHECK-GI-NEXT: fmov s17, w11 -; CHECK-GI-NEXT: fmov s19, w13 +; CHECK-GI-NEXT: zip1 v17.2d, v17.2d, v20.2d +; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v7.s[1], v22.s[0] +; CHECK-GI-NEXT: mov v4.s[2], v19.s[0] +; CHECK-GI-NEXT: fmov s19, w11 ; CHECK-GI-NEXT: fmov s23, w0 -; CHECK-GI-NEXT: fmov s24, w11 -; CHECK-GI-NEXT: ldr s6, [sp, #8] +; CHECK-GI-NEXT: mov v6.s[1], w9 +; CHECK-GI-NEXT: fmov s24, w9 +; CHECK-GI-NEXT: ldr s2, [sp, #8] +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: ldr s0, [sp, #24] ; CHECK-GI-NEXT: ldr s5, [sp, #32] +; CHECK-GI-NEXT: mov v19.s[1], w11 ; CHECK-GI-NEXT: mov v7.s[2], v18.s[0] -; CHECK-GI-NEXT: mov v17.s[1], w11 -; CHECK-GI-NEXT: mov v19.s[1], w13 -; CHECK-GI-NEXT: mov v20.s[2], w12 ; CHECK-GI-NEXT: ldr s16, [sp, #72] ; CHECK-GI-NEXT: mov v23.s[1], w1 ; CHECK-GI-NEXT: ldr s18, [sp, #80] -; CHECK-GI-NEXT: mov v21.s[2], v6.s[0] -; CHECK-GI-NEXT: mov v24.s[1], w11 +; CHECK-GI-NEXT: mov v21.s[2], v2.s[0] +; CHECK-GI-NEXT: mov v24.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] -; CHECK-GI-NEXT: fmov s6, w4 -; CHECK-GI-NEXT: add x10, sp, #88 +; CHECK-GI-NEXT: fmov s5, w4 +; CHECK-GI-NEXT: mov v20.s[2], w10 +; CHECK-GI-NEXT: add x8, sp, #88 ; CHECK-GI-NEXT: movi v22.4s, #31 -; CHECK-GI-NEXT: mov v3.s[3], v16.s[0] -; CHECK-GI-NEXT: mov v17.s[2], w11 -; CHECK-GI-NEXT: mov v19.s[2], w13 -; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: ldr s1, [sp, #40] -; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10] -; CHECK-GI-NEXT: eor v5.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v4.s[3], v16.s[0] +; CHECK-GI-NEXT: mov v6.s[2], w9 +; CHECK-GI-NEXT: mov v19.s[2], w11 +; CHECK-GI-NEXT: ldr s1, [sp, #16] +; CHECK-GI-NEXT: ldr s3, [sp, #40] +; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8] ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: mov v6.s[1], w5 -; CHECK-GI-NEXT: add x8, sp, #72 -; CHECK-GI-NEXT: add x9, sp, #96 -; CHECK-GI-NEXT: mov v21.s[3], v2.s[0] -; CHECK-GI-NEXT: mov v24.s[2], w11 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: ld1 { v4.s }[3], [x8] -; CHECK-GI-NEXT: bic v2.16b, v22.16b, v3.16b -; CHECK-GI-NEXT: ld1 { v18.s }[2], [x9] -; CHECK-GI-NEXT: and v1.16b, v5.16b, v17.16b +; CHECK-GI-NEXT: mov v5.s[1], w5 +; CHECK-GI-NEXT: add x8, sp, #96 +; CHECK-GI-NEXT: eor v2.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v21.s[3], v1.s[0] +; CHECK-GI-NEXT: mov v24.s[2], w9 +; CHECK-GI-NEXT: mov v0.s[2], v3.s[0] +; CHECK-GI-NEXT: bic v1.16b, v22.16b, v4.16b +; CHECK-GI-NEXT: ld1 { v18.s }[2], [x8] ; CHECK-GI-NEXT: neg v3.4s, v19.4s +; CHECK-GI-NEXT: and v4.16b, v17.16b, v22.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b ; CHECK-GI-NEXT: mov v23.s[3], w3 -; CHECK-GI-NEXT: mov v6.s[2], w6 -; CHECK-GI-NEXT: and v4.16b, v4.16b, v22.16b -; CHECK-GI-NEXT: ushr v5.4s, v21.4s, #1 -; CHECK-GI-NEXT: neg v2.4s, v2.4s -; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b +; CHECK-GI-NEXT: mov v5.s[2], w6 +; CHECK-GI-NEXT: ushr v6.4s, v21.4s, #1 ; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: neg v2.4s, v2.4s ; CHECK-GI-NEXT: ushl v3.4s, v23.4s, v4.4s -; CHECK-GI-NEXT: ushl v2.4s, v5.4s, v2.4s -; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b +; CHECK-GI-NEXT: ushl v1.4s, v6.4s, v1.4s +; CHECK-GI-NEXT: ushl v4.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b ; 
CHECK-GI-NEXT: mov s2, v1.s[1] ; CHECK-GI-NEXT: mov s3, v1.s[2] ; CHECK-GI-NEXT: mov s4, v1.s[3] +; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: mov s5, v0.s[1] ; CHECK-GI-NEXT: mov s6, v0.s[2] -; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: fmov w4, s0 ; CHECK-GI-NEXT: fmov w1, s2 ; CHECK-GI-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll index 2213aa1429dbd..4e1876db772ed 100644 --- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll +++ b/llvm/test/CodeGen/AArch64/llvm.frexp.ll @@ -700,13 +700,14 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; CHECK-NEXT: ldr s1, [sp, #44] ; CHECK-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ld1 { v1.s }[1], [x19] ; CHECK-NEXT: mov v2.s[3], v0.s[0] -; CHECK-NEXT: ld1 { v1.s }[2], [x20] +; CHECK-NEXT: ld1 { v1.s }[1], [x19] +; CHECK-NEXT: ldr s0, [x20] +; CHECK-NEXT: ld1 { v0.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: ld1 { v1.s }[3], [x21] ; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: zip1 v1.2d, v1.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret ; @@ -872,10 +873,11 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; CHECK-NEXT: bl frexpf ; CHECK-NEXT: ldr s0, [sp, #28] ; CHECK-NEXT: ld1 { v0.s }[1], [x19] -; CHECK-NEXT: ld1 { v0.s }[2], [x20] +; CHECK-NEXT: ldr s1, [x20] +; CHECK-NEXT: ld1 { v1.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s }[3], [x21] ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll index 57f61e5303ecf..0bf4fb1063025 100644 --- a/llvm/test/CodeGen/AArch64/load-combine.ll +++ b/llvm/test/CodeGen/AArch64/load-combine.ll @@ -713,3 +713,182 @@ define void @short_vector_to_i64(ptr %in, ptr %out, ptr %p) { store i64 %i3, ptr %out ret void } + +; x1 = x0 +define void @scalable_vector_to_i32(ptr %in, ptr %out, ptr %p) #0 { +; CHECK-LABEL: scalable_vector_to_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %ld = load <vscale x 4 x i8>, ptr %in, align 4 + + %e1 = extractelement <vscale x 4 x i8> %ld, i32 0 + %e2 = extractelement <vscale x 4 x i8> %ld, i32 1 + %e3 = extractelement <vscale x 4 x i8> %ld, i32 2 + %e4 = extractelement <vscale x 4 x i8> %ld, i32 3 + + %z0 = zext i8 %e1 to i32 + %z1 = zext i8 %e2 to i32 + %z2 = zext i8 %e3 to i32 + %z3 = zext i8 %e4 to i32 + + %s1 = shl nuw nsw i32 %z1, 8 + %s2 = shl nuw nsw i32 %z2, 16 + %s3 = shl nuw i32 %z3, 24 + + %i1 = or i32 %s1, %z0 + %i2 = or i32 %i1, %s2 + %i3 = or i32 %i2, %s3 + + store i32 %i3, ptr %out + ret void +} + +define void @scalable_vector_to_i32_unused_low_i8(ptr %in, ptr %out, ptr %p) #0 { +; CHECK-LABEL: scalable_vector_to_i32_unused_low_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov w9, v0.s[2] +; CHECK-NEXT: mov w10, v0.s[3] +; CHECK-NEXT: lsl w8, w8, #8 +; CHECK-NEXT: orr w8, w8, w9, lsl #16 +; CHECK-NEXT: orr w8, w8, w10, lsl #24 +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %ld = load <vscale x 4 x i8>, ptr %in, align 4 + + %e2 = extractelement <vscale x 4 x i8> %ld, i32 1 + %e3 = extractelement <vscale x 4 x i8> %ld, i32 2 + %e4 = extractelement <vscale x 4 x i8> %ld, i32 3 + + %z1 = 
zext i8 %e2 to i32 + %z2 = zext i8 %e3 to i32 + %z3 = zext i8 %e4 to i32 + + %s1 = shl nuw nsw i32 %z1, 8 + %s2 = shl nuw nsw i32 %z2, 16 + %s3 = shl nuw i32 %z3, 24 + + %i2 = or i32 %s1, %s2 + %i3 = or i32 %i2, %s3 + + store i32 %i3, ptr %out + ret void +} + +define void @scalable_vector_to_i32_unused_high_i8(ptr %in, ptr %out, ptr %p) #0 { +; CHECK-LABEL: scalable_vector_to_i32_unused_high_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov w8, v0.s[2] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %ld = load <vscale x 4 x i8>, ptr %in, align 4 + + %e1 = extractelement <vscale x 4 x i8> %ld, i32 0 + %e2 = extractelement <vscale x 4 x i8> %ld, i32 1 + %e3 = extractelement <vscale x 4 x i8> %ld, i32 2 + + %z0 = zext i8 %e1 to i32 + %z1 = zext i8 %e2 to i32 + %z2 = zext i8 %e3 to i32 + + %s1 = shl nuw nsw i32 %z1, 8 + %s2 = shl nuw nsw i32 %z2, 16 + + %i1 = or i32 %s1, %z0 + %i2 = or i32 %i1, %s2 + + store i32 %i2, ptr %out + ret void +} + +define void @scalable_vector_to_i32_unused_low_i16(ptr %in, ptr %out, ptr %p) #0 { +; CHECK-LABEL: scalable_vector_to_i32_unused_low_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov w8, v0.s[2] +; CHECK-NEXT: mov w9, v0.s[3] +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: orr w8, w8, w9, lsl #24 +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %ld = load <vscale x 4 x i8>, ptr %in, align 4 + + %e3 = extractelement <vscale x 4 x i8> %ld, i32 2 + %e4 = extractelement <vscale x 4 x i8> %ld, i32 3 + + %z2 = zext i8 %e3 to i32 + %z3 = zext i8 %e4 to i32 + + %s2 = shl nuw nsw i32 %z2, 16 + %s3 = shl nuw i32 %z3, 24 + + %i3 = or i32 %s2, %s3 + + store i32 %i3, ptr %out + ret void +} + +; x1 = x0[0:1] +define void @scalable_vector_to_i32_unused_high_i16(ptr %in, ptr %out, ptr %p) #0 { +; CHECK-LABEL: scalable_vector_to_i32_unused_high_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %ld = load <vscale x 4 x i8>, ptr %in, align 4 + + %e1 = extractelement <vscale x 4 x i8> %ld, i32 0 + %e2 = extractelement <vscale x 4 x i8> %ld, i32 1 + + %z0 = zext i8 %e1 to i32 + %z1 = zext i8 %e2 to i32 + + %s1 = shl nuw nsw i32 %z1, 8 + + %i1 = or i32 %s1, %z0 + + store i32 %i1, ptr %out + ret void +} + +; x1 = x0 +define void @scalable_vector_to_i64(ptr %in, ptr %out, ptr %p) #0 { +; CHECK-LABEL: scalable_vector_to_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: ret + %ld = load <vscale x 4 x i8>, ptr %in, align 4 + + %e1 = extractelement <vscale x 4 x i8> %ld, i32 0 + %e2 = extractelement <vscale x 4 x i8> %ld, i32 1 + %e3 = extractelement <vscale x 4 x i8> %ld, i32 2 + %e4 = extractelement <vscale x 4 x i8> %ld, i32 3 + + %z0 = zext i8 %e1 to i64 + %z1 = zext i8 %e2 to i64 + %z2 = zext i8 %e3 to i64 + %z3 = zext i8 %e4 to i64 + + %s1 = shl nuw nsw i64 %z1, 8 + %s2 = shl nuw nsw i64 %z2, 16 + %s3 = shl nuw i64 %z3, 24 + + %i1 = or i64 %s1, %z0 + %i2 = or i64 %i1, %s2 + %i3 = or i64 %i2, %s3 + + store i64 %i3, ptr %out + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-safe-range-in-middle.mir b/llvm/test/CodeGen/AArch64/machine-outliner-safe-range-in-middle.mir index 23811425101fd..b99ca25a5432d 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-safe-range-in-middle.mir +++ b/llvm/test/CodeGen/AArch64/machine-outliner-safe-range-in-middle.mir @@ -16,27 +16,53 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BL @OUTLINED_FUNCTION_0, implicit-def $lr, implicit $sp, implicit-def $lr, implicit-def $x0, implicit-def $x1, implicit-def $x2, implicit-def $x3, implicit-def $x16, implicit $x0, 
implicit $sp ; CHECK-NEXT: $x9 = ADDXri $x16, 16, 0 - ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 0 + ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 1 ; CHECK-NEXT: BL @OUTLINED_FUNCTION_0, implicit-def $lr, implicit $sp, implicit-def $lr, implicit-def $x0, implicit-def $x1, implicit-def $x2, implicit-def $x3, implicit-def $x16, implicit $x0, implicit $sp ; CHECK-NEXT: $x9 = ADDXri $x9, 16, 0 - ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 0 + ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 2 ; CHECK-NEXT: RET undef $x9 $x0 = ADDXri $x0, 0, 0 $x1 = ADDXri $x0, 1, 0 $x2 = ADDXri $x0, 2, 0 $x3 = ADDXri $x0, 3, 0 - - ; End safe range $x16 = ADDXri $x0, 16, 0 $x9 = ADDXri $x16, 16, 0 - $x16 = ADDXri killed $x16, 16, 0 - + $x16 = ADDXri killed $x16, 16, 1 + ; End safe range $x0 = ADDXri $x0, 0, 0 $x1 = ADDXri $x0, 1, 0 $x2 = ADDXri $x0, 2, 0 $x3 = ADDXri $x0, 3, 0 - ; End safe range $x16 = ADDXri $x0, 16, 0 $x9 = ADDXri $x9, 16, 0 - $x16 = ADDXri killed $x16, 16, 0 + $x16 = ADDXri killed $x16, 16, 2 + ; End safe range RET undef $x9 +... +--- +name: unsafe_range_bundle +tracksRegLiveness: true +machineFunctionInfo: + hasRedZone: false +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: unsafe_range_bundle + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = ADDXri $x0, 0, 0 + ; CHECK-NEXT: $x16 = ADDXri $x0, 16, 0 + ; CHECK-NEXT: BUNDLE { + ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 3 + ; CHECK-NEXT: $x1 = ADDXri $x0, 0, 0 + ; CHECK-NEXT: } + ; CHECK-NEXT: RET undef $x9 + $x0 = ADDXri $x0, 0, 0 + $x16 = ADDXri $x0, 16, 0 + BUNDLE { ; Bundle crosses a safe range + $x16 = ADDXri killed $x16, 16, 3 + ; End safe range + $x1 = ADDXri $x0, 0, 0 + } + RET undef $x9 +... diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 4f0c4080aa0ce..9443004ea434b 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -6810,195 +6810,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-SD-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: .cfi_offset w29, -16 -; CHECK-SD-NEXT: ldr b5, [sp, #208] +; CHECK-SD-NEXT: ldr b0, [sp, #208] ; CHECK-SD-NEXT: add x8, sp, #216 -; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: add x9, sp, #272 +; CHECK-SD-NEXT: ldr b2, [sp, #80] ; CHECK-SD-NEXT: ldr b4, [sp, #976] -; CHECK-SD-NEXT: add x9, sp, #984 -; CHECK-SD-NEXT: add x12, sp, #328 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #224 -; CHECK-SD-NEXT: movi v1.16b, #1 -; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] -; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 -; CHECK-SD-NEXT: add x11, sp, #992 ; CHECK-SD-NEXT: ldr b6, [sp, #720] -; CHECK-SD-NEXT: ldr b7, [sp, #80] -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #224 +; CHECK-SD-NEXT: fmov s16, w0 +; CHECK-SD-NEXT: ldr b17, [sp, #848] +; CHECK-SD-NEXT: add x10, sp, #24 +; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #232 -; CHECK-SD-NEXT: add x13, sp, #88 -; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11] -; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #856 -; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: add x14, sp, #1008 -; CHECK-SD-NEXT: add x15, sp, #872 -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: mov v16.b[1], w1 +; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #240 -; CHECK-SD-NEXT: add x16, sp, #888 -; CHECK-SD-NEXT: add x10, sp, #16 -; CHECK-SD-NEXT: add x9, sp, #24 -; CHECK-SD-NEXT: add x11, sp, #40 -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: mov v16.b[2], w2 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #248 -; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: mov v16.b[3], w3 +; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #256 -; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #264 -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] -; CHECK-SD-NEXT: add x8, sp, #272 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8] +; CHECK-SD-NEXT: mov v16.b[4], w4 +; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-SD-NEXT: ldr b1, [x9] ; CHECK-SD-NEXT: add x8, sp, #280 -; CHECK-SD-NEXT: mov v0.b[6], w6 -; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: mov v16.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #288 -; CHECK-SD-NEXT: mov v0.b[7], w7 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #296 -; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10] -; CHECK-SD-NEXT: add x10, sp, #128 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] +; CHECK-SD-NEXT: mov v16.b[6], w6 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #304 -; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9] -; CHECK-SD-NEXT: add x9, sp, #136 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8] +; CHECK-SD-NEXT: mov v16.b[7], w7 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #312 -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #320 -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #32 -; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8] -; CHECK-SD-NEXT: add x8, sp, #144 -; 
CHECK-SD-NEXT: ld1 { v5.b }[15], [x12] -; CHECK-SD-NEXT: add x12, sp, #728 -; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12] -; CHECK-SD-NEXT: add x12, sp, #1000 -; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11] -; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12] -; CHECK-SD-NEXT: add x12, sp, #736 -; CHECK-SD-NEXT: add x11, sp, #920 -; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: ldr b5, [sp, #848] -; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12] -; CHECK-SD-NEXT: add x12, sp, #48 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #744 -; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14] -; CHECK-SD-NEXT: add x14, sp, #96 -; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12] -; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13] -; CHECK-SD-NEXT: add x13, sp, #864 -; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14] -; CHECK-SD-NEXT: add x14, sp, #1016 -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13] -; CHECK-SD-NEXT: add x13, sp, #752 -; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14] -; CHECK-SD-NEXT: add x14, sp, #104 -; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13] -; CHECK-SD-NEXT: add x13, sp, #1024 -; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15] -; CHECK-SD-NEXT: add x15, sp, #760 -; CHECK-SD-NEXT: add x14, sp, #112 -; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13] -; CHECK-SD-NEXT: add x13, sp, #880 -; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15] -; CHECK-SD-NEXT: add x15, sp, #1032 -; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13] -; CHECK-SD-NEXT: add x14, sp, #768 -; CHECK-SD-NEXT: add x13, sp, #120 -; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15] -; CHECK-SD-NEXT: add x15, sp, #1040 -; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14] -; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13] -; CHECK-SD-NEXT: add x13, sp, #776 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16] -; CHECK-SD-NEXT: add x14, sp, #1048 -; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15] -; CHECK-SD-NEXT: add x15, sp, #896 -; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13] -; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-SD-NEXT: add x10, sp, #784 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-SD-NEXT: add x13, sp, #1056 -; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14] -; CHECK-SD-NEXT: add x14, sp, #904 -; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10] -; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-SD-NEXT: add x9, sp, #792 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-SD-NEXT: add x10, sp, #1064 -; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13] -; CHECK-SD-NEXT: add x13, sp, #912 -; CHECK-SD-NEXT: ld1 { v6.b }[9], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8] -; CHECK-SD-NEXT: add x9, sp, #800 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13] +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #328 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #96 +; CHECK-SD-NEXT: add x9, sp, #144 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #104 +; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: movi v1.16b, #1 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #128 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #136 +; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-SD-NEXT: ldr b3, [x9] ; CHECK-SD-NEXT: add x8, sp, #152 -; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10] -; CHECK-SD-NEXT: add x10, sp, #1072 -; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8] -; CHECK-SD-NEXT: add x9, sp, #808 -; 
CHECK-SD-NEXT: ld1 { v5.b }[9], [x11] -; CHECK-SD-NEXT: add x8, sp, #56 -; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #160 -; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #928 -; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10] -; CHECK-SD-NEXT: add x10, sp, #1080 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9] +; CHECK-SD-NEXT: add x9, sp, #984 +; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #160 +; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #168 +; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #176 +; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #184 +; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #192 +; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #200 +; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #992 +; CHECK-SD-NEXT: add x9, sp, #1040 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1000 +; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1008 +; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1016 +; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1024 +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1032 +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-SD-NEXT: ldr b5, [x9] +; CHECK-SD-NEXT: add x8, sp, #1048 +; CHECK-SD-NEXT: add x9, sp, #728 +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #1056 +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1064 +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1072 +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1080 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1088 +; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1096 +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #736 +; CHECK-SD-NEXT: add x9, sp, #784 +; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #744 +; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #752 +; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b +; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #760 +; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #768 +; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #776 +; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8] +; CHECK-SD-NEXT: ldr b7, [x9] +; CHECK-SD-NEXT: add x8, sp, #792 +; CHECK-SD-NEXT: add x9, sp, #856 +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #800 +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #808 +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #816 -; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10] -; CHECK-SD-NEXT: add x9, sp, #168 -; CHECK-SD-NEXT: add x10, sp, #176 -; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8] -; CHECK-SD-NEXT: add x8, sp, #936 -; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #1088 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] -; CHECK-SD-NEXT: add x8, sp, #64 -; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9] -; CHECK-SD-NEXT: add x9, sp, #824 -; CHECK-SD-NEXT: ld1 
{ v0.b }[14], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[13], [x9] -; CHECK-SD-NEXT: add x9, sp, #944 -; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #1096 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9] +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #824 +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #832 -; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10] -; CHECK-SD-NEXT: add x9, sp, #184 -; CHECK-SD-NEXT: add x10, sp, #72 -; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #952 -; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #840 -; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10] -; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b -; CHECK-SD-NEXT: add x9, sp, #192 -; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8] +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #864 +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9] +; CHECK-SD-NEXT: add x9, sp, #912 +; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #872 +; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d +; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10] +; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #880 +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #888 +; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #896 +; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #904 +; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-SD-NEXT: ldr b18, [x9] +; CHECK-SD-NEXT: add x8, sp, #920 +; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 +; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8] +; CHECK-SD-NEXT: add x8, sp, #928 +; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8] +; CHECK-SD-NEXT: add x8, sp, #936 +; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8] +; CHECK-SD-NEXT: add x8, sp, #944 +; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8] +; CHECK-SD-NEXT: add x8, sp, #952 +; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #64 +; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8] ; CHECK-SD-NEXT: add x8, sp, #960 -; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: add x8, sp, #200 -; CHECK-SD-NEXT: add x9, sp, #968 -; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8] -; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9] -; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b -; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s +; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #72 +; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8] +; CHECK-SD-NEXT: add x8, sp, #968 +; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b +; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d +; CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: add v0.4s, v5.4s, v19.4s ; CHECK-SD-NEXT: addv s0, v0.4s ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll 
b/llvm/test/CodeGen/AArch64/nontemporal.ll index f8ba150a0405f..f7a87ae340a73 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal.ll @@ -683,41 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) { ; ; CHECK-BE-LABEL: test_stnp_v17f32: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 +; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-BE-NEXT: ldr s16, [sp, #36] +; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BE-NEXT: ldr s17, [sp, #4] -; CHECK-BE-NEXT: add x8, sp, #44 -; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x8, sp, #12 +; CHECK-BE-NEXT: add x9, sp, #20 +; CHECK-BE-NEXT: ldr s16, [sp, #36] ; CHECK-BE-NEXT: mov v0.s[1], v1.s[0] +; CHECK-BE-NEXT: ldr s1, [sp, #4] +; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x10, sp, #52 ; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 ; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-BE-NEXT: ldr s1, [sp, #68] -; CHECK-BE-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #12 -; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #52 -; CHECK-BE-NEXT: str s1, [x0, #64] -; CHECK-BE-NEXT: ld1 { v16.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #20 +; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] +; CHECK-BE-NEXT: ldr s5, [x9] +; CHECK-BE-NEXT: add x8, sp, #28 +; CHECK-BE-NEXT: add x9, sp, #44 +; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8] +; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-BE-NEXT: ldr s17, [x10] +; CHECK-BE-NEXT: add x8, sp, #60 ; CHECK-BE-NEXT: mov v4.s[2], v6.s[0] ; CHECK-BE-NEXT: mov v0.s[2], v2.s[0] -; CHECK-BE-NEXT: ld1 { v17.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #60 -; CHECK-BE-NEXT: ld1 { v16.s }[3], [x8] -; CHECK-BE-NEXT: add x8, sp, #28 -; CHECK-BE-NEXT: ld1 { v17.s }[3], [x8] +; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-BE-NEXT: ldr s2, [sp, #68] +; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: str s2, [x0, #64] +; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d ; CHECK-BE-NEXT: mov v4.s[3], v7.s[0] -; CHECK-BE-NEXT: add x8, x0, #48 ; CHECK-BE-NEXT: mov v0.s[3], v3.s[0] -; CHECK-BE-NEXT: st1 { v16.4s }, [x8] -; CHECK-BE-NEXT: add x8, x0, #32 -; CHECK-BE-NEXT: st1 { v17.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: st1 { v5.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x8] ; CHECK-BE-NEXT: st1 { v0.4s }, [x0] ; CHECK-BE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/pr148949.ll b/llvm/test/CodeGen/AArch64/pr148949.ll new file mode 100644 index 0000000000000..7dd9e8f86f0cf --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr148949.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve | FileCheck %s + +define <vscale x 4 x i32> @widget(i1 %arg, <vscale x 4 x i1> %arg1, <vscale x 4 x i32> %arg2, <vscale x 4 x i32> %arg3) { +; CHECK-LABEL: widget: +; CHECK: // %bb.0: // %bb +; CHECK-NEXT: mvn w8, w0 +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: whilelo p1.s, xzr, x8 +; CHECK-NEXT: mov z1.s, p1/z, #1 // =0x1 +; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: ret +bb: + %insertelement = 
insertelement <vscale x 4 x i1> zeroinitializer, i1 %arg, i64 0 + %shufflevector = shufflevector <vscale x 4 x i1> %insertelement, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> zeroinitializer + %xor = xor <vscale x 4 x i1> %shufflevector, splat (i1 true) + %zext = zext <vscale x 4 x i1> %xor to <vscale x 4 x i32> + %select = select <vscale x 4 x i1> %arg1, <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> %arg2 + %or = or <vscale x 4 x i32> %select, %zext + ret <vscale x 4 x i32> %or +} diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 1376f5d9a380d..b124042265d40 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -19,8 +19,13 @@ define i8 @si8_7(i8 %a, i8 %b) { ; CHECK-GI-LABEL: si8_7: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 -; CHECK-GI-NEXT: mov w9, #7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, #-109 // =0xffffff93 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: add w8, w0, w8, asr #8 +; CHECK-GI-NEXT: sbfx w8, w8, #2, #6 +; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -45,8 +50,14 @@ define i8 @si8_100(i8 %a, i8 %b) { ; CHECK-GI-LABEL: si8_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: mov w9, #41 // =0x29 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 +; CHECK-GI-NEXT: asr w8, w8, #4 +; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: @@ -129,8 +140,12 @@ define i16 @si16_7(i16 %a, i16 %b) { ; CHECK-GI-LABEL: si16_7: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxth w8, w0 -; CHECK-GI-NEXT: mov w9, #7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, #18725 // =0x4925 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: asr w8, w8, #16 +; CHECK-GI-NEXT: asr w8, w8, #1 +; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -155,8 +170,13 @@ define i16 @si16_100(i16 %a, i16 %b) { ; CHECK-GI-LABEL: si16_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: mov w9, #5243 // =0x147b +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: asr w8, w8, #16 +; CHECK-GI-NEXT: asr w8, w8, #3 +; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: @@ -240,8 +260,13 @@ define i32 @si32_7(i32 %a, i32 %b) { ; ; CHECK-GI-LABEL: si32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w0, w8 +; CHECK-GI-NEXT: mov w8, #9363 // =0x2493 +; CHECK-GI-NEXT: movk w8, #37449, lsl #16 +; CHECK-GI-NEXT: smull x8, w0, w8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: add w8, w8, w0 +; CHECK-GI-NEXT: asr w8, w8, #2 +; CHECK-GI-NEXT: add w8, w8, w8, lsr #31 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -265,9 +290,14 @@ define i32 @si32_100(i32 %a, i32 %b) { ; ; CHECK-GI-LABEL: si32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w9, w0, w8 -; CHECK-GI-NEXT: msub w0, w9, w8, w0 +; CHECK-GI-NEXT: mov w8, #34079 // =0x851f +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: movk w8, #20971, lsl #16 +; 
CHECK-GI-NEXT: smull x8, w0, w8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: asr w8, w8, #5 +; CHECK-GI-NEXT: add w8, w8, w8, lsr #31 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: %s = srem i32 %a, 100 @@ -348,8 +378,13 @@ define i64 @si64_7(i64 %a, i64 %b) { ; ; CHECK-GI-LABEL: si64_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: sdiv x8, x0, x8 +; CHECK-GI-NEXT: mov x8, #18725 // =0x4925 +; CHECK-GI-NEXT: movk x8, #9362, lsl #16 +; CHECK-GI-NEXT: movk x8, #37449, lsl #32 +; CHECK-GI-NEXT: movk x8, #18724, lsl #48 +; CHECK-GI-NEXT: smulh x8, x0, x8 +; CHECK-GI-NEXT: asr x8, x8, #1 +; CHECK-GI-NEXT: add x8, x8, x8, lsr #63 ; CHECK-GI-NEXT: lsl x9, x8, #3 ; CHECK-GI-NEXT: sub x8, x9, x8 ; CHECK-GI-NEXT: sub x0, x0, x8 @@ -376,9 +411,16 @@ define i64 @si64_100(i64 %a, i64 %b) { ; ; CHECK-GI-LABEL: si64_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: sdiv x9, x0, x8 -; CHECK-GI-NEXT: msub x0, x9, x8, x0 +; CHECK-GI-NEXT: mov x8, #55051 // =0xd70b +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: movk x8, #28835, lsl #16 +; CHECK-GI-NEXT: movk x8, #2621, lsl #32 +; CHECK-GI-NEXT: movk x8, #41943, lsl #48 +; CHECK-GI-NEXT: smulh x8, x0, x8 +; CHECK-GI-NEXT: add x8, x8, x0 +; CHECK-GI-NEXT: asr x8, x8, #6 +; CHECK-GI-NEXT: add x8, x8, x8, lsr #63 +; CHECK-GI-NEXT: msub x0, x8, x9, x0 ; CHECK-GI-NEXT: ret entry: %s = srem i64 %a, 100 @@ -644,25 +686,49 @@ define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) { ; ; CHECK-GI-LABEL: sv2i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: mov w8, #65427 // =0xff93 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: sdiv w9, w9, w8 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: smov w11, v1.h[1] -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: smov w10, v1.h[0] +; CHECK-GI-NEXT: smov w8, v1.h[0] +; CHECK-GI-NEXT: smov w9, v1.h[1] +; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24 +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v2.b[1], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: neg v2.8b, v2.8b +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: movi v3.2s, #7 +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: umov w8, v1.b[0] +; CHECK-GI-NEXT: umov w10, v1.b[1] +; CHECK-GI-NEXT: umov w9, v2.b[0] +; CHECK-GI-NEXT: umov w11, v2.b[1] +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: fmov s1, w10 -; CHECK-GI-NEXT: mov v1.s[1], w11 -; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: 
mov v1.s[1], w10 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i8> %d, <i8 7, i8 7> @@ -687,25 +753,46 @@ define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) { ; ; CHECK-GI-LABEL: sv2i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: mov w8, #41 // =0x29 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: sdiv w9, w9, w8 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: smov w11, v1.h[1] -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: smov w10, v1.h[0] +; CHECK-GI-NEXT: smov w8, v1.h[0] +; CHECK-GI-NEXT: smov w9, v1.h[1] +; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24 +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: neg v3.8b, v3.8b +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: mov v2.b[1], w8 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: neg v2.8b, v2.8b +; CHECK-GI-NEXT: movi v3.2s, #100 +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: umov w8, v1.b[0] +; CHECK-GI-NEXT: umov w10, v1.b[1] +; CHECK-GI-NEXT: umov w9, v2.b[0] +; CHECK-GI-NEXT: umov w11, v2.b[1] +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: fmov s1, w10 -; CHECK-GI-NEXT: mov v1.s[1], w11 -; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i8> %d, <i8 100, i8 100> @@ -872,30 +959,37 @@ define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) { ; ; CHECK-GI-LABEL: sv4i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v3.4h, #7 -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: mov v2.h[1], w8 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w9, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w9 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov w8, #147 // =0x93 +; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: sshr 
v2.4h, v2.4h, #8 +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: mov v4.b[1], w9 +; CHECK-GI-NEXT: mov v1.b[2], w8 +; CHECK-GI-NEXT: mov v4.b[2], w9 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: mov v4.b[3], w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #8 +; CHECK-GI-NEXT: mov v3.b[3], w8 +; CHECK-GI-NEXT: uzp1 v1.8b, v2.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: dup v3.4h, w9 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v4.8b +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i8> %d, <i8 7, i8 7, i8 7, i8 7> @@ -943,30 +1037,37 @@ define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) { ; ; CHECK-GI-LABEL: sv4i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov w8, #41 // =0x29 +; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #8 +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: mov v4.b[1], w9 +; CHECK-GI-NEXT: mov v1.b[2], w8 +; CHECK-GI-NEXT: mov v4.b[2], w9 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: mov v4.b[3], w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 +; CHECK-GI-NEXT: mov v3.b[3], w8 ; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v3.4h, #100 -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: mov v2.h[1], w8 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w9, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w9 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: dup v3.4h, w8 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v4.8b +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i8> %d, <i8 100, i8 100, i8 100, i8 100> @@ -988,42 +1089,15 @@ define <8 x i8> @sv8i8_7(<8 x i8> %d, <8 x i8> %e) { ; ; CHECK-GI-LABEL: sv8i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v4.8b, #7 -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: fmov 
w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: movi v1.8b, #147 +; CHECK-GI-NEXT: movi v3.8b, #7 +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: add v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #2 +; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7 +; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #2 +; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i8> %d, @@ -1044,42 +1118,14 @@ define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) { ; ; CHECK-GI-LABEL: sv8i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v4.8b, #100 -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: movi v1.8b, #41 +; CHECK-GI-NEXT: movi v3.8b, #100 +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #4 +; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7 +; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #4 +; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i8> %d, @@ -1102,72 +1148,16 @@ define <16 x i8> @sv16i8_7(<16 x i8> %d, <16 x i8> %e) { ; ; CHECK-GI-LABEL: sv16i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; 
CHECK-GI-NEXT: movi v16.8b, #7 -; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s2 -; CHECK-GI-NEXT: fmov w17, s0 -; CHECK-GI-NEXT: fmov w2, s3 -; CHECK-GI-NEXT: mov w14, v2.s[1] -; CHECK-GI-NEXT: mov w18, v0.s[1] -; CHECK-GI-NEXT: mov w3, v3.s[1] -; CHECK-GI-NEXT: mov w15, v2.s[2] -; CHECK-GI-NEXT: mov w0, v0.s[2] -; CHECK-GI-NEXT: sdiv w11, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov w4, v3.s[2] -; CHECK-GI-NEXT: mov w16, v2.s[3] -; CHECK-GI-NEXT: mov w1, v0.s[3] -; CHECK-GI-NEXT: mov w5, v3.s[3] -; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: sdiv w17, w17, w8 -; CHECK-GI-NEXT: fmov s5, w13 -; CHECK-GI-NEXT: sdiv w2, w2, w8 -; CHECK-GI-NEXT: fmov s6, w17 -; CHECK-GI-NEXT: sdiv w12, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[2] -; CHECK-GI-NEXT: fmov s7, w2 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v4.s[1], w12 -; CHECK-GI-NEXT: sdiv w18, w18, w8 -; CHECK-GI-NEXT: mov v5.s[1], w14 -; CHECK-GI-NEXT: sdiv w3, w3, w8 -; CHECK-GI-NEXT: mov v6.s[1], w18 -; CHECK-GI-NEXT: sdiv w10, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[3] -; CHECK-GI-NEXT: mov v7.s[1], w3 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v4.s[2], w10 -; CHECK-GI-NEXT: sdiv w0, w0, w8 -; CHECK-GI-NEXT: mov v5.s[2], w15 -; CHECK-GI-NEXT: sdiv w4, w4, w8 -; CHECK-GI-NEXT: mov v6.s[2], w0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: mov v7.s[2], w4 -; CHECK-GI-NEXT: sdiv w16, w16, w8 -; CHECK-GI-NEXT: mov v4.s[3], w9 -; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s -; CHECK-GI-NEXT: sdiv w1, w1, w8 -; CHECK-GI-NEXT: mov v5.s[3], w16 -; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s -; CHECK-GI-NEXT: sdiv w8, w5, w8 -; CHECK-GI-NEXT: mov v6.s[3], w1 -; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s -; CHECK-GI-NEXT: mov v7.s[3], w8 -; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: movi v1.16b, #147 +; CHECK-GI-NEXT: movi v3.16b, #7 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v1.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: sshr v2.16b, v1.16b, #2 +; CHECK-GI-NEXT: ushr v2.16b, v2.16b, #7 +; CHECK-GI-NEXT: ssra v2.16b, v1.16b, #2 +; CHECK-GI-NEXT: mls v0.16b, v2.16b, v3.16b ; CHECK-GI-NEXT: ret entry: %s = srem <16 x i8> %d, @@ -1189,72 +1179,15 @@ define <16 x i8> @sv16i8_100(<16 x i8> %d, <16 x i8> %e) { ; ; CHECK-GI-LABEL: sv16i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v16.8b, #100 -; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s2 -; CHECK-GI-NEXT: fmov w17, s0 -; CHECK-GI-NEXT: fmov w2, s3 -; CHECK-GI-NEXT: mov w14, v2.s[1] -; CHECK-GI-NEXT: mov w18, v0.s[1] -; CHECK-GI-NEXT: mov w3, v3.s[1] -; CHECK-GI-NEXT: mov w15, v2.s[2] -; CHECK-GI-NEXT: mov w0, v0.s[2] -; 
CHECK-GI-NEXT: sdiv w11, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov w4, v3.s[2] -; CHECK-GI-NEXT: mov w16, v2.s[3] -; CHECK-GI-NEXT: mov w1, v0.s[3] -; CHECK-GI-NEXT: mov w5, v3.s[3] -; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: sdiv w17, w17, w8 -; CHECK-GI-NEXT: fmov s5, w13 -; CHECK-GI-NEXT: sdiv w2, w2, w8 -; CHECK-GI-NEXT: fmov s6, w17 -; CHECK-GI-NEXT: sdiv w12, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[2] -; CHECK-GI-NEXT: fmov s7, w2 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v4.s[1], w12 -; CHECK-GI-NEXT: sdiv w18, w18, w8 -; CHECK-GI-NEXT: mov v5.s[1], w14 -; CHECK-GI-NEXT: sdiv w3, w3, w8 -; CHECK-GI-NEXT: mov v6.s[1], w18 -; CHECK-GI-NEXT: sdiv w10, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[3] -; CHECK-GI-NEXT: mov v7.s[1], w3 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v4.s[2], w10 -; CHECK-GI-NEXT: sdiv w0, w0, w8 -; CHECK-GI-NEXT: mov v5.s[2], w15 -; CHECK-GI-NEXT: sdiv w4, w4, w8 -; CHECK-GI-NEXT: mov v6.s[2], w0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: mov v7.s[2], w4 -; CHECK-GI-NEXT: sdiv w16, w16, w8 -; CHECK-GI-NEXT: mov v4.s[3], w9 -; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s -; CHECK-GI-NEXT: sdiv w1, w1, w8 -; CHECK-GI-NEXT: mov v5.s[3], w16 -; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s -; CHECK-GI-NEXT: sdiv w8, w5, w8 -; CHECK-GI-NEXT: mov v6.s[3], w1 -; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s -; CHECK-GI-NEXT: mov v7.s[3], w8 -; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: movi v1.16b, #41 +; CHECK-GI-NEXT: movi v3.16b, #100 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: sshr v2.16b, v1.16b, #4 +; CHECK-GI-NEXT: ushr v2.16b, v2.16b, #7 +; CHECK-GI-NEXT: ssra v2.16b, v1.16b, #4 +; CHECK-GI-NEXT: mls v0.16b, v2.16b, v3.16b ; CHECK-GI-NEXT: ret entry: %s = srem <16 x i8> %d, @@ -1754,20 +1687,31 @@ define <2 x i16> @sv2i16_7(<2 x i16> %d, <2 x i16> %e) { ; ; CHECK-GI-LABEL: sv2i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: mov w8, #18725 // =0x4925 +; CHECK-GI-NEXT: shl v2.2s, v0.2s, #16 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: sshr v2.2s, v2.2s, #16 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #15 // =0xf +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v2.4h, v3.4h +; CHECK-GI-NEXT: dup v3.2s, w8 +; CHECK-GI-NEXT: ushl v2.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: 
ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i16> %d, @@ -1792,20 +1736,31 @@ define <2 x i16> @sv2i16_100(<2 x i16> %d, <2 x i16> %e) { ; ; CHECK-GI-LABEL: sv2i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: mov w8, #5243 // =0x147b +; CHECK-GI-NEXT: shl v2.2s, v0.2s, #16 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: sshr v2.2s, v2.2s, #16 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w8, #3 // =0x3 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #15 // =0xf +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v2.4h, v3.4h +; CHECK-GI-NEXT: dup v3.2s, w8 +; CHECK-GI-NEXT: ushl v2.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i16> %d, @@ -1949,24 +1904,15 @@ define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) { ; ; CHECK-GI-LABEL: sv4i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v2.4h, #7 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: adrp x8, .LCPI44_0 +; CHECK-GI-NEXT: movi v3.4h, #7 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI44_0] +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #1 +; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #1 +; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -1988,24 +1934,15 @@ define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) { ; ; CHECK-GI-LABEL: sv4i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v2.4h, #100 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: 
sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: adrp x8, .LCPI45_0 +; CHECK-GI-NEXT: movi v3.4h, #100 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI45_0] +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #3 +; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #3 +; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -2028,38 +1965,16 @@ define <8 x i16> @sv8i16_7(<8 x i16> %d, <8 x i16> %e) { ; ; CHECK-GI-LABEL: sv8i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v4.4h, #7 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: adrp x8, .LCPI46_0 +; CHECK-GI-NEXT: movi v3.8h, #7 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI46_0] +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sshr v2.8h, v1.8h, #1 +; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: ssra v2.8h, v1.8h, #1 +; CHECK-GI-NEXT: mls v0.8h, v2.8h, v3.8h ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i16> %d, @@ -2082,38 +1997,16 @@ define <8 x i16> @sv8i16_100(<8 x i16> %d, <8 x i16> %e) { ; ; CHECK-GI-LABEL: sv8i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v4.4h, #100 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; 
CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: adrp x8, .LCPI47_0 +; CHECK-GI-NEXT: movi v3.8h, #100 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI47_0] +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sshr v2.8h, v1.8h, #3 +; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: ssra v2.8h, v1.8h, #3 +; CHECK-GI-NEXT: mls v0.8h, v2.8h, v3.8h ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i16> %d, @@ -2499,17 +2392,16 @@ define <2 x i32> @sv2i32_7(<2 x i32> %d, <2 x i32> %e) { ; ; CHECK-GI-LABEL: sv2i32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: movi v2.2s, #7 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: adrp x8, .LCPI56_0 +; CHECK-GI-NEXT: movi v3.2s, #7 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI56_0] +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s +; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #2 +; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31 +; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #2 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i32> %d, @@ -2532,17 +2424,15 @@ define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) { ; ; CHECK-GI-LABEL: sv2i32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: movi v2.2s, #100 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: adrp x8, .LCPI57_0 +; CHECK-GI-NEXT: movi v3.2s, #100 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI57_0] +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #5 +; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31 +; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #5 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i32> %d, @@ -2664,21 +2554,17 @@ define <4 x i32> @sv4i32_7(<4 x i32> %d, <4 x i32> %e) { ; ; CHECK-GI-LABEL: sv4i32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: movi v2.4s, #7 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: adrp x8, .LCPI60_0 +; CHECK-GI-NEXT: movi v3.4s, #7 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI60_0] +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshr v2.4s, 
v1.4s, #2 +; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: ssra v2.4s, v1.4s, #2 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i32> %d, @@ -2702,21 +2588,16 @@ define <4 x i32> @sv4i32_100(<4 x i32> %d, <4 x i32> %e) { ; ; CHECK-GI-LABEL: sv4i32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: movi v2.4s, #100 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: adrp x8, .LCPI61_0 +; CHECK-GI-NEXT: movi v3.4s, #100 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI61_0] +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: ssra v2.4s, v1.4s, #5 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i32> %d, diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll new file mode 100644 index 0000000000000..b647daf72ca35 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll @@ -0,0 +1,223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s + + +define <2 x i16> @saturating_2xi16(<2 x i16> %a, <2 x i16> %b) { +; CHECK-LABEL: saturating_2xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret + %as = sext <2 x i16> %a to <2 x i32> + %bs = sext <2 x i16> %b to <2 x i32> + %m = mul <2 x i32> %bs, %as + %sh = ashr <2 x i32> %m, splat (i32 15) + %ma = tail call <2 x i32> @llvm.smin.v4i32(<2 x i32> %sh, <2 x i32> splat (i32 32767)) + %t = trunc <2 x i32> %ma to <2 x i16> + ret <2 x i16> %t +} + +define <4 x i16> @saturating_4xi16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: saturating_4xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %as = sext <4 x i16> %a to <4 x i32> + %bs = sext <4 x i16> %b to <4 x i32> + %m = mul <4 x i32> %bs, %as + %sh = ashr <4 x i32> %m, splat (i32 15) + %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767)) + %t = trunc <4 x i32> %ma to <4 x i16> + ret <4 x i16> %t +} + +define <8 x i16> @saturating_8xi16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: saturating_8xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.8h, v1.8h, v0.8h +; CHECK-NEXT: ret + %as = sext <8 x i16> %a to <8 x i32> + %bs = sext <8 x i16> %b to <8 x i32> + %m = mul <8 x i32> %bs, %as + %sh = ashr <8 x i32> %m, splat (i32 15) + %ma = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 32767)) + %t = trunc <8 x i32> %ma to <8 x i16> + ret <8 x i16> %t +} + +define <2 x i32> @saturating_2xi32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: saturating_2xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret + %as = sext <2 x 
i32> %a to <2 x i64> + %bs = sext <2 x i32> %b to <2 x i64> + %m = mul <2 x i64> %bs, %as + %sh = ashr <2 x i64> %m, splat (i64 31) + %ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat (i64 2147483647)) + %t = trunc <2 x i64> %ma to <2 x i32> + ret <2 x i32> %t +} + +define <4 x i32> @saturating_4xi32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: saturating_4xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %as = sext <4 x i32> %a to <4 x i64> + %bs = sext <4 x i32> %b to <4 x i64> + %m = mul <4 x i64> %bs, %as + %sh = ashr <4 x i64> %m, splat (i64 31) + %ma = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %sh, <4 x i64> splat (i64 2147483647)) + %t = trunc <4 x i64> %ma to <4 x i32> + ret <4 x i32> %t +} + +define <8 x i32> @saturating_8xi32(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: saturating_8xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sqdmulh v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %as = sext <8 x i32> %a to <8 x i64> + %bs = sext <8 x i32> %b to <8 x i64> + %m = mul <8 x i64> %bs, %as + %sh = ashr <8 x i64> %m, splat (i64 31) + %ma = tail call <8 x i64> @llvm.smin.v8i64(<8 x i64> %sh, <8 x i64> splat (i64 2147483647)) + %t = trunc <8 x i64> %ma to <8 x i32> + ret <8 x i32> %t +} + +define <2 x i64> @saturating_2xi32_2xi64(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: saturating_2xi32_2xi64: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %as = sext <2 x i32> %a to <2 x i64> + %bs = sext <2 x i32> %b to <2 x i64> + %m = mul <2 x i64> %bs, %as + %sh = ashr <2 x i64> %m, splat (i64 31) + %ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat (i64 2147483647)) + ret <2 x i64> %ma +} + +define <6 x i16> @saturating_6xi16(<6 x i16> %a, <6 x i16> %b) { +; CHECK-LABEL: saturating_6xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: smull2 v3.4s, v1.8h, v0.8h +; CHECK-NEXT: movi v2.4s, #127, msl #8 +; CHECK-NEXT: sqdmulh v0.4h, v1.4h, v0.4h +; CHECK-NEXT: sshr v3.4s, v3.4s, #15 +; CHECK-NEXT: smin v2.4s, v3.4s, v2.4s +; CHECK-NEXT: xtn2 v0.8h, v2.4s +; CHECK-NEXT: ret + %as = sext <6 x i16> %a to <6 x i32> + %bs = sext <6 x i16> %b to <6 x i32> + %m = mul <6 x i32> %bs, %as + %sh = ashr <6 x i32> %m, splat (i32 15) + %ma = tail call <6 x i32> @llvm.smin.v6i32(<6 x i32> %sh, <6 x i32> splat (i32 32767)) + %t = trunc <6 x i32> %ma to <6 x i16> + ret <6 x i16> %t +} + +define <4 x i16> @unsupported_saturation_value_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: unsupported_saturation_value_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: movi v1.4s, #42 +; CHECK-NEXT: sshr v0.4s, v0.4s, #15 +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %as = sext <4 x i16> %a to <4 x i32> + %bs = sext <4 x i16> %b to <4 x i32> + %m = mul <4 x i32> %bs, %as + %sh = ashr <4 x i32> %m, splat (i32 15) + %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 42)) + %t = trunc <4 x i32> %ma to <4 x i16> + ret <4 x i16> %t +} + +define <4 x i16> @unsupported_shift_value_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: unsupported_shift_value_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: sshr v0.4s, v0.4s, #3 +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %as = sext <4 x i16> %a to <4 x i32> + %bs = 
sext <4 x i16> %b to <4 x i32> + %m = mul <4 x i32> %bs, %as + %sh = ashr <4 x i32> %m, splat (i32 3) + %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767)) + %t = trunc <4 x i32> %ma to <4 x i16> + ret <4 x i16> %t +} + +define <2 x i16> @extend_to_illegal_type(<2 x i16> %a, <2 x i16> %b) { +; CHECK-LABEL: extend_to_illegal_type: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret + %as = sext <2 x i16> %a to <2 x i48> + %bs = sext <2 x i16> %b to <2 x i48> + %m = mul <2 x i48> %bs, %as + %sh = ashr <2 x i48> %m, splat (i48 15) + %ma = tail call <2 x i48> @llvm.smin.v4i32(<2 x i48> %sh, <2 x i48> splat (i48 32767)) + %t = trunc <2 x i48> %ma to <2 x i16> + ret <2 x i16> %t +} + +define <2 x i11> @illegal_source(<2 x i11> %a, <2 x i11> %b) { +; CHECK-LABEL: illegal_source: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #21 +; CHECK-NEXT: shl v1.2s, v1.2s, #21 +; CHECK-NEXT: sshr v0.2s, v0.2s, #21 +; CHECK-NEXT: sshr v1.2s, v1.2s, #21 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: movi v1.2s, #127, msl #8 +; CHECK-NEXT: sshr v0.2s, v0.2s, #15 +; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %as = sext <2 x i11> %a to <2 x i32> + %bs = sext <2 x i11> %b to <2 x i32> + %m = mul <2 x i32> %bs, %as + %sh = ashr <2 x i32> %m, splat (i32 15) + %ma = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %sh, <2 x i32> splat (i32 32767)) + %t = trunc <2 x i32> %ma to <2 x i11> + ret <2 x i11> %t +} +define <1 x i16> @saturating_1xi16(<1 x i16> %a, <1 x i16> %b) { +; CHECK-LABEL: saturating_1xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: zip1 v1.4h, v1.4h, v0.4h +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: movi v1.2s, #127, msl #8 +; CHECK-NEXT: sshr v0.2s, v0.2s, #15 +; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret + %as = sext <1 x i16> %a to <1 x i32> + %bs = sext <1 x i16> %b to <1 x i32> + %m = mul <1 x i32> %bs, %as + %sh = ashr <1 x i32> %m, splat (i32 15) + %ma = tail call <1 x i32> @llvm.smin.v1i32(<1 x i32> %sh, <1 x i32> splat (i32 32767)) + %t = trunc <1 x i32> %ma to <1 x i16> + ret <1 x i16> %t +} diff --git a/llvm/test/CodeGen/AArch64/spill-fold.mir b/llvm/test/CodeGen/AArch64/spill-fold.mir index 0149e4504bed2..9ea9ce53b68a8 100644 --- a/llvm/test/CodeGen/AArch64/spill-fold.mir +++ b/llvm/test/CodeGen/AArch64/spill-fold.mir @@ -10,6 +10,7 @@ define i64 @test_subreg_fill_fold() { ret i64 0 } define double @test_subreg_fill_fold2() { ret double 0.0 } define <4 x float> @test_subreg_fill_fold3() { ret <4 x float> undef } + define i64 @test_subreg_fill_fold4() { ret i64 0 } define i64 @test_nzcv_spill_fold() { ret i64 0 } ... --- @@ -121,6 +122,24 @@ body: | RET_ReallyLR implicit $s0 ... --- +# CHECK-LABEL: name: test_subreg_fill_fold4 +# Ensure the COPY is maintained when its result register class is not compatible +# with the fill load's. 
+name: test_subreg_fill_fold4 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr64sp } +body: | + bb.0: + %0 = COPY $wzr + INLINEASM &nop, 1, 12, implicit-def dead $x0, 12, implicit-def dead $x1, 12, implicit-def dead $x2, 12, implicit-def dead $x3, 12, implicit-def dead $x4, 12, implicit-def dead $x5, 12, implicit-def dead $x6, 12, implicit-def dead $x7, 12, implicit-def dead $x8, 12, implicit-def dead $x9, 12, implicit-def dead $x10, 12, implicit-def dead $x11, 12, implicit-def dead $x12, 12, implicit-def dead $x13, 12, implicit-def dead $x14, 12, implicit-def dead $x15, 12, implicit-def dead $x16, 12, implicit-def dead $x17, 12, implicit-def dead $x18, 12, implicit-def dead $x19, 12, implicit-def dead $x20, 12, implicit-def dead $x21, 12, implicit-def dead $x22, 12, implicit-def dead $x23, 12, implicit-def dead $x24, 12, implicit-def dead $x25, 12, implicit-def dead $x26, 12, implicit-def dead $x27, 12, implicit-def dead $x28, 12, implicit-def dead $fp, 12, implicit-def dead $lr, 12, implicit-def $sp + ; CHECK: %2:gpr32 = LDRWui %stack.0, 0 :: (load (s32) from %stack.0) + ; CHECK: undef %1.sub_32:gpr64sp = COPY %2 + undef %1.sub_32:gpr64sp = COPY %0 + $x0 = COPY %1 + RET_ReallyLR implicit $x0 +... +--- # CHECK-LABEL: name: test_nzcv_spill_fold # Ensure that nzcv COPY cannot be folded. name: test_nzcv_spill_fold diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index 83c9b73c57570..2b16dd0f29ecc 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -1,5 +1,5 @@ -# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy %s -o - | FileCheck %s -# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s --check-prefix=EXPAND --- | ; ModuleID = '' source_filename = "" @@ -14,13 +14,14 @@ define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_ppr_to_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable } - define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #2 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable } - define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #2 { entry: unreachable } attributes #0 = { nounwind "target-features"="+sve" } attributes #1 = { nounwind "target-features"="+sve2p1" } + attributes #2 = { nounwind "target-features"="+sve,+sme2" "aarch64_pstate_sm_enabled" } ... 
--- @@ -318,10 +319,10 @@ registers: - { id: 0, class: zpr2 } stack: liveins: - - { reg: '$z0_z1', virtual-reg: '%0' } + - { reg: '$z1_z2', virtual-reg: '%0' } body: | bb.0.entry: - liveins: $z0_z1 + liveins: $z1_z2 ; CHECK-LABEL: name: spills_fills_stack_id_zpr2 ; CHECK: stack: @@ -329,12 +330,12 @@ body: | ; CHECK-NEXT: stack-id: scalable-vector ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2 - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z1, $sp, 1 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z1 = LDR_ZXI $sp, 1 + ; EXPAND: STR_ZXI $z1, $sp, 0 + ; EXPAND: STR_ZXI $z2, $sp, 1 + ; EXPAND: $z1 = LDR_ZXI $sp, 0 + ; EXPAND: $z2 = LDR_ZXI $sp, 1 - %0:zpr2 = COPY $z0_z1 + %0:zpr2 = COPY $z1_z2 $z0_z1_z2_z3 = IMPLICIT_DEF $z4_z5_z6_z7 = IMPLICIT_DEF @@ -345,7 +346,7 @@ body: | $z24_z25_z26_z27 = IMPLICIT_DEF $z28_z29_z30_z31 = IMPLICIT_DEF - $z0_z1 = COPY %0 + $z1_z2 = COPY %0 RET_ReallyLR ... --- @@ -439,10 +440,10 @@ registers: - { id: 0, class: zpr4 } stack: liveins: - - { reg: '$z0_z1_z2_z3', virtual-reg: '%0' } + - { reg: '$z1_z2_z3_z4', virtual-reg: '%0' } body: | bb.0.entry: - liveins: $z0_z1_z2_z3 + liveins: $z1_z2_z3_z4 ; CHECK-LABEL: name: spills_fills_stack_id_zpr4 ; CHECK: stack: @@ -450,16 +451,16 @@ body: | ; CHECK-NEXT: stack-id: scalable-vector ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4 - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z1, $sp, 1 - ; EXPAND: STR_ZXI $z2, $sp, 2 - ; EXPAND: STR_ZXI $z3, $sp, 3 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z1 = LDR_ZXI $sp, 1 - ; EXPAND: $z2 = LDR_ZXI $sp, 2 - ; EXPAND: $z3 = LDR_ZXI $sp, 3 + ; EXPAND: STR_ZXI $z1, $sp, 0 + ; EXPAND: STR_ZXI $z2, $sp, 1 + ; EXPAND: STR_ZXI $z3, $sp, 2 + ; EXPAND: STR_ZXI $z4, $sp, 3 + ; EXPAND: $z1 = LDR_ZXI $sp, 0 + ; EXPAND: $z2 = LDR_ZXI $sp, 1 + ; EXPAND: $z3 = LDR_ZXI $sp, 2 + ; EXPAND: $z4 = LDR_ZXI $sp, 3 - %0:zpr4 = COPY $z0_z1_z2_z3 + %0:zpr4 = COPY $z1_z2_z3_z4 $z0_z1_z2_z3 = IMPLICIT_DEF $z4_z5_z6_z7 = IMPLICIT_DEF @@ -470,7 +471,7 @@ body: | $z24_z25_z26_z27 = IMPLICIT_DEF $z28_z29_z30_z31 = IMPLICIT_DEF - $z0_z1_z2_z3 = COPY %0 + $z1_z2_z3_z4 = COPY %0 RET_ReallyLR ... --- diff --git a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir index ae70f91a4ec64..c3c39f4d9cee2 100644 --- a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir +++ b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir @@ -12,7 +12,7 @@ body: | bb.0: liveins: $p0, $z0 - ; CHECK: add_x + ; CHECK: name: add_x ; CHECK-NOT: MOVPRFX ; CHECK: $z0 = FADD_ZPmZ_S renamable $p0, killed $z0, renamable $z0 ; CHECK-NEXT: RET @@ -21,22 +21,117 @@ body: | ... -# CHECK: {{.*}} MSB_ZPmZZ_B {{.*}} --- name: expand_mls_to_msb body: | bb.0: + ; CHECK: name: expand_mls_to_msb + ; CHECK: {{.*}} MSB_ZPmZZ_B {{.*}} renamable $p0 = PTRUE_B 31, implicit $vg renamable $z0 = MLS_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 RET_ReallyLR implicit $z0 ... -# CHECK: {{.*}} MAD_ZPmZZ_B {{.*}} --- name: expand_mla_to_mad body: | bb.0: + ; CHECK: name: expand_mla_to_mad + ; CHECK: {{.*}} MAD_ZPmZZ_B {{.*}} renamable $p0 = PTRUE_B 31, implicit $vg renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 RET_ReallyLR implicit $z0 ... 
+
+---
+name: expand_transfer_implicit_defs
+body: |
+  bb.0:
+    ; CHECK: name: expand_transfer_implicit_defs
+    ; CHECK: BUNDLE
+    ; CHECK-SAME: implicit-def $z0_z1_z2_z3
+    liveins: $z1, $z2, $p0
+    renamable $z0 = FADD_ZPZZ_D_UNDEF killed $p0, killed $z1, killed $z2, implicit-def $z0_z1_z2_z3
+    RET_ReallyLR implicit $z0_z1_z2_z3
+...
+
+---
+name: unary_undef_operand
+body: |
+  bb.0:
+    liveins: $p0, $z0
+
+    ; CHECK: name: unary_undef_operand
+    ; CHECK: $z0 = MOVPRFX_ZZ undef $z1
+    ; CHECK: $z0 = ABS_ZPmZ_S internal killed $z0, renamable $p0, killed undef renamable $z1
+    ; NOTE: Unary _UNDEF pseudo instructions ignore the passthru operand.
+    renamable $z0 = ABS_ZPmZ_S_UNDEF renamable $z0, renamable $p0, killed undef renamable $z1
+    RET_ReallyLR
+
+...
+
+---
+name: binop_undef_operand
+body: |
+  bb.0:
+    liveins: $p0, $z1
+
+    ; CHECK: name: binop_undef_operand
+    ; CHECK-NOT: MOVPRFX
+    ; CHECK: $z0 = SMIN_ZPmZ_S renamable $p0, killed undef $z0, killed renamable $z1
+    renamable $z0 = SMIN_ZPZZ_S_UNDEF renamable $p0, undef renamable $z0, killed renamable $z1
+    RET_ReallyLR
+
+...
+
+---
+name: binop_undef_operand_requires_movprfx
+body: |
+  bb.0:
+    liveins: $p0, $z1
+
+    ; CHECK: name: binop_undef_operand_requires_movprfx
+    ; CHECK: $z0 = MOVPRFX_ZZ undef $z2
+    ; CHECK: $z0 = SMIN_ZPmZ_S renamable $p0, internal killed $z0, killed renamable $z1
+    renamable $z0 = SMIN_ZPZZ_S_UNDEF renamable $p0, undef renamable $z2, killed renamable $z1
+    RET_ReallyLR
+
+...
+
+---
+name: binop_undef_operand_requires_zeroing_movprfx
+body: |
+  bb.0:
+    liveins: $p0, $z1
+
+    ; CHECK: name: binop_undef_operand_requires_zeroing_movprfx
+    ; CHECK: $z0 = MOVPRFX_ZPzZ_S $p0, undef $z2
+    ; CHECK: $z0 = ADD_ZPmZ_S renamable $p0, internal killed $z0, killed renamable $z1
+    renamable $z0 = ADD_ZPZZ_S_ZERO renamable $p0, undef renamable $z2, killed renamable $z1
+    RET_ReallyLR
+
+...
+
+---
+name: ternaryop_undef_operand
+body: |
+  bb.0:
+    liveins: $p0, $z1, $z2
+    ; CHECK: name: ternaryop_undef_operand
+    ; CHECK-NOT: MOVPRFX
+    ; CHECK: $z0 = MLA_ZPmZZ_B killed renamable $p0, killed undef $z0, killed renamable $z1, killed renamable $z2
+    renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed undef renamable $z0, killed renamable $z1, killed renamable $z2
+    RET_ReallyLR implicit $z0
+...
+
+---
+name: ternaryop_undef_operand_requires_movprfx
+body: |
+  bb.0:
+    liveins: $p0, $z1, $z2
+    ; CHECK: name: ternaryop_undef_operand_requires_movprfx
+    ; CHECK: $z0 = MOVPRFX_ZZ undef $z3
+    ; CHECK: $z0 = MLA_ZPmZZ_B killed renamable $p0, internal killed $z0, killed renamable $z1, killed renamable $z2
+    renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed undef renamable $z3, killed renamable $z1, killed renamable $z2
+    RET_ReallyLR implicit $z0
+...
diff --git a/llvm/test/CodeGen/AArch64/unsupported-object-format-err.ll b/llvm/test/CodeGen/AArch64/unsupported-object-format-err.ll
new file mode 100644
index 0000000000000..0767425d12737
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/unsupported-object-format-err.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -mtriple=aarch64-pc-unknown-xcoff -filetype=null %s 2>&1 | FileCheck -check-prefix=OBJFORMAT %s
+; RUN: not llc -mtriple=aarch64-pc-unknown-goff -filetype=null %s 2>&1 | FileCheck -check-prefix=OBJFORMAT %s
+
+; RUN: not llc -mtriple=aarch64-unknown-linux-coff -filetype=null %s 2>&1 | FileCheck -check-prefix=MCINIT %s
+; CHECK: LLVM ERROR: cannot initialize MC for non-Windows COFF object files
+
+; Make sure there is no crash or assert with unexpected object
+; formats.
+ +; OBJFORMAT: LLVM ERROR: unsupported object format +; MCINIT: LLVM ERROR: cannot initialize MC for non-Windows COFF object files +define void @foo() { + ret void +} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll index 8a84d3ca2328c..59dfcf9850a49 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -63,8 +63,9 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) { ; ; CHECK-SD-FP16-LABEL: add_v3HalfH: ; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: movi d1, #0000000000000000 ; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-FP16-NEXT: mov v0.h[3], wzr +; CHECK-SD-FP16-NEXT: mov v0.h[3], v1.h[0] ; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h ; CHECK-SD-FP16-NEXT: faddp h0, v0.2h ; CHECK-SD-FP16-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index 32e461ba09f06..e1ef3f9be0a5d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) { ; GCN-LABEL: s_andn2_i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir index dde566d9643d8..5b8c2840b0156 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir @@ -1,8 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s --- name: bswap_i32_vv @@ -21,7 +19,6 @@ body: | ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935 ; GFX7-NEXT: 
[[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_e64_1]], [[V_ALIGNBIT_B32_e64_]], implicit $exec ; GFX7-NEXT: S_ENDPGM 0, implicit [[V_BFI_B32_e64_]] - ; ; GFX8-LABEL: name: bswap_i32_vv ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -29,22 +26,6 @@ body: | ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 ; GFX8-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] - ; - ; GFX9-LABEL: name: bswap_i32_vv - ; GFX9: liveins: $vgpr0 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 - ; GFX9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec - ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] - ; - ; GFX10-LABEL: name: bswap_i32_vv - ; GFX10: liveins: $vgpr0 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 - ; GFX10-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_BSWAP %0 S_ENDPGM 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir index fa95f33909b76..0a4cb3ccf2957 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s --- @@ -24,24 +24,6 @@ body: | ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]] ; - ; GFX9-LABEL: name: fshr_s32 - ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] - ; - ; GFX10-LABEL: name: fshr_s32 - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] - ; ; GFX11-LABEL: name: fshr_s32 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir index cebdffc74847c..eba64b853ac05 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir @@ -223,37 +223,37 @@ body: | ; GFX7-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX9-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX10-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX11-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX12-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = 
COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst (<2 x s32>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir index eafc96dd32bdd..474f1308d8e24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir @@ -252,30 +252,30 @@ body: | ; GFX7-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>), addrspace 1) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>), addrspace 1) + ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX9-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<2 x s32>), addrspace 1) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; ; GFX10-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<2 x s32>), addrspace 1) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst (<2 x s32>), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir index 2675295ea98ed..ae010a872a41d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir @@ -22,6 +22,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (s32)) + ; ; GFX9-LABEL: name: atomic_store_flat_s32_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -51,6 +52,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 ; GFX7-NEXT: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p0) :: (store seq_cst (<2 x s16>)) + ; ; GFX9-LABEL: name: atomic_store_flat_v2s16_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -80,6 +82,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 ; GFX7-NEXT: G_STORE [[COPY]](p3), [[COPY1]](p0) :: (store seq_cst (p3)) + ; ; GFX9-LABEL: name: atomic_store_flat_p3_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -109,6 +112,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 ; GFX7-NEXT: G_STORE [[COPY]](p5), [[COPY1]](p0) :: (store seq_cst (p5)) + ; ; GFX9-LABEL: name: atomic_store_flat_p5_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -138,6 +142,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p6) = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 ; GFX7-NEXT: G_STORE [[COPY]](p6), [[COPY1]](p0) :: (store seq_cst (p6)) + ; ; GFX9-LABEL: name: atomic_store_flat_p6_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -167,6 +172,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (s64)) + ; ; GFX9-LABEL: name: atomic_store_flat_s64_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -193,15 +199,16 @@ body: | ; GFX7-LABEL: name: atomic_store_flat_v2s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p0) :: (store seq_cst (<2 x s32>)) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (<2 x s32>)) + ; ; GFX9-LABEL: name: atomic_store_flat_v2s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p0) :: (store seq_cst (<2 x s32>)) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (<2 x s32>)) %0:vgpr(<2 x s32>) = 
COPY $vgpr0_vgpr1 %1:vgpr(p0) = COPY $vgpr2_vgpr3 G_STORE %0, %1 :: (store seq_cst (<2 x s32>), align 8, addrspace 0) @@ -225,6 +232,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 ; GFX7-NEXT: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p0) :: (store seq_cst (<4 x s16>)) + ; ; GFX9-LABEL: name: atomic_store_flat_v4s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -254,6 +262,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 ; GFX7-NEXT: G_STORE [[COPY]](p0), [[COPY1]](p0) :: (store seq_cst (p0)) + ; ; GFX9-LABEL: name: atomic_store_flat_p0_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -282,6 +291,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 ; GFX7-NEXT: G_STORE [[COPY]](p1), [[COPY1]](p0) :: (store seq_cst (p1)) + ; ; GFX9-LABEL: name: atomic_store_flat_p1_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/known-bits-sbfe.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/known-bits-sbfe.mir new file mode 100644 index 0000000000000..7a6e07ddf2290 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/known-bits-sbfe.mir @@ -0,0 +1,253 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -passes='print<gisel-value-tracking>' %s -filetype=null 2>&1 | FileCheck %s + +--- +name: test_s_bfe_u32_constants +body: | + bb.0: + ; Extract [12:16) + ; CHECK-LABEL: name: @test_s_bfe_u32_constants + ; CHECK-NEXT: %cst:sgpr_32 KnownBits:00000000000000001111111111111111 SignBits:16 + ; CHECK-NEXT: %bfe:sgpr_32 KnownBits:00000000000000000000000000001111 SignBits:28 + %cst:sgpr_32(s32) = G_CONSTANT i32 65535 + %bfe:sgpr_32(s32) = S_BFE_U32 %cst, 262156, implicit-def $scc + $sgpr0 = COPY %bfe +... +--- +name: test_s_bfe_i32_constants +body: | + bb.0: + ; Extract [12:16) + ; CHECK-LABEL: name: @test_s_bfe_i32_constants + ; CHECK-NEXT: %cst:sgpr_32 KnownBits:00000000000000001111111111111111 SignBits:16 + ; CHECK-NEXT: %bfe:sgpr_32 KnownBits:11111111111111111111111111111111 SignBits:32 + %cst:sgpr_32(s32) = G_CONSTANT i32 65535 + %bfe:sgpr_32(s32) = S_BFE_I32 %cst, 262156, implicit-def $scc + $sgpr0 = COPY %bfe +... +--- +name: test_s_bfe_u64_constants +body: | + bb.0: + ; Extract [12:16) + ; CHECK-LABEL: name: @test_s_bfe_u64_constants + ; CHECK-NEXT: %cst:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000001111111111111111 SignBits:48 + ; CHECK-NEXT: %bfe:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000000000000000001111 SignBits:60 + %cst:sgpr_64(s64) = G_CONSTANT i64 65535 + %bfe:sgpr_64(s64) = S_BFE_U64 %cst, 262156, implicit-def $scc + $sgpr0_sgpr1 = COPY %bfe +... +--- +name: test_s_bfe_i64_constants +body: | + bb.0: + ; Extract [12:16) + ; CHECK-LABEL: name: @test_s_bfe_i64_constants + ; CHECK-NEXT: %cst:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000001111111111111111 SignBits:48 + ; CHECK-NEXT: %bfe:sgpr_64 KnownBits:1111111111111111111111111111111111111111111111111111111111111111 SignBits:64 + %cst:sgpr_64(s64) = G_CONSTANT i64 65535 + %bfe:sgpr_64(s64) = S_BFE_I64 %cst, 262156, implicit-def $scc + $sgpr0_sgpr1 = COPY %bfe +...
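The four tests above all pass 262156 as the packed src1 operand, and the tests that follow use 524296. A quick way to see why those mean "extract [12:16)" and "extract [8:16)" is to decode the operand by hand. The helper below is a sketch that assumes the GCN field layout (offset in the low bits, with the 32-bit forms reading 5 of them and the 64-bit forms 6, width in bits [22:16]):

    # Decode the packed S_BFE src1 operand used by these tests.
    # The field layout is an assumption based on the GCN ISA docs.
    def decode_bfe_src1(src1: int, is_64bit: bool) -> tuple[int, int]:
        offset = src1 & (0x3F if is_64bit else 0x1F)
        width = (src1 >> 16) & 0x7F
        return offset, width

    assert decode_bfe_src1(262156, False) == (12, 4)  # (4 << 16) | 12: extract [12:16)
    assert decode_bfe_src1(524296, True) == (8, 8)    # (8 << 16) | 8: extract [8:16)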
+--- +name: test_s_bfe_u32_middle_bits_unknown +body: | + bb.0: + ; Extract [8:16) but the middle 4 bits are ???? + liveins: $sgpr0 + + ; CHECK-LABEL: name: @test_s_bfe_u32_middle_bits_unknown + ; CHECK-NEXT: %input:sgpr_32 KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %cst:sgpr_32 KnownBits:00000000000000001100001111111111 SignBits:16 + ; CHECK-NEXT: %mask:sgpr_32 KnownBits:00000000000000000011110000000000 SignBits:18 + ; CHECK-NEXT: %masked_input:sgpr_32 KnownBits:000000000000000000????0000000000 SignBits:18 + ; CHECK-NEXT: %merged:sgpr_32 KnownBits:000000000000000011????1111111111 SignBits:16 + ; CHECK-NEXT: %bfe:sgpr_32 KnownBits:00000000000000000000000011????11 SignBits:24 + %input:sgpr_32(s32) = COPY $sgpr0 + %cst:sgpr_32(s32) = G_CONSTANT i32 50175 + %mask:sgpr_32(s32) = G_CONSTANT i32 15360 + %masked_input:sgpr_32(s32) = G_AND %input, %mask + %merged:sgpr_32(s32) = G_OR %masked_input, %cst + %bfe:sgpr_32(s32) = S_BFE_U32 %merged, 524296, implicit-def $scc + $sgpr0 = COPY %bfe +... +--- +name: test_s_bfe_i32_middle_bits_unknown +body: | + bb.0: + ; Extract [8:16) but the middle 4 bits are ???? + liveins: $sgpr0 + + ; CHECK-LABEL: name: @test_s_bfe_i32_middle_bits_unknown + ; CHECK-NEXT: %input:sgpr_32 KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %cst:sgpr_32 KnownBits:00000000000000001100001111111111 SignBits:16 + ; CHECK-NEXT: %mask:sgpr_32 KnownBits:00000000000000000011110000000000 SignBits:18 + ; CHECK-NEXT: %masked_input:sgpr_32 KnownBits:000000000000000000????0000000000 SignBits:18 + ; CHECK-NEXT: %merged:sgpr_32 KnownBits:000000000000000011????1111111111 SignBits:16 + ; CHECK-NEXT: %bfe:sgpr_32 KnownBits:11111111111111111111111111????11 SignBits:26 + %input:sgpr_32(s32) = COPY $sgpr0 + %cst:sgpr_32(s32) = G_CONSTANT i32 50175 + %mask:sgpr_32(s32) = G_CONSTANT i32 15360 + %masked_input:sgpr_32(s32) = G_AND %input, %mask + %merged:sgpr_32(s32) = G_OR %masked_input, %cst + %bfe:sgpr_32(s32) = S_BFE_I32 %merged, 524296, implicit-def $scc + $sgpr0 = COPY %bfe +... +--- +name: test_s_bfe_u64_middle_bits_unknown +body: | + bb.0: + ; Extract [8:16) but the middle 4 bits are ???? + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: @test_s_bfe_u64_middle_bits_unknown + ; CHECK-NEXT: %input:sgpr_64 KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %cst:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000001100001111111111 SignBits:48 + ; CHECK-NEXT: %mask:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000000011110000000000 SignBits:50 + ; CHECK-NEXT: %masked_input:sgpr_64 KnownBits:00000000000000000000000000000000000000000000000000????0000000000 SignBits:50 + ; CHECK-NEXT: %merged:sgpr_64 KnownBits:00000000000000000000000000000000000000000000000011????1111111111 SignBits:48 + ; CHECK-NEXT: %bfe:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000000000000011????11 SignBits:56 + %input:sgpr_64(s64) = COPY $sgpr0_sgpr1 + %cst:sgpr_64(s64) = G_CONSTANT i64 50175 + %mask:sgpr_64(s64) = G_CONSTANT i64 15360 + %masked_input:sgpr_64(s64) = G_AND %input, %mask + %merged:sgpr_64(s64) = G_OR %masked_input, %cst + %bfe:sgpr_64(s64) = S_BFE_U64 %merged, 524296, implicit-def $scc + $sgpr0_sgpr1 = COPY %bfe +... +--- +name: test_s_bfe_i64_middle_bits_unknown +body: | + bb.0: + ; Extract [8:16) but the middle 4 bits are ???? 
+ liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: @test_s_bfe_i64_middle_bits_unknown + ; CHECK-NEXT: %input:sgpr_64 KnownBits:???????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %cst:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000001100001111111111 SignBits:48 + ; CHECK-NEXT: %mask:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000000011110000000000 SignBits:50 + ; CHECK-NEXT: %masked_input:sgpr_64 KnownBits:00000000000000000000000000000000000000000000000000????0000000000 SignBits:50 + ; CHECK-NEXT: %merged:sgpr_64 KnownBits:00000000000000000000000000000000000000000000000011????1111111111 SignBits:48 + ; CHECK-NEXT: %bfe:sgpr_64 KnownBits:1111111111111111111111111111111111111111111111111111111111????11 SignBits:58 + %input:sgpr_64(s64) = COPY $sgpr0_sgpr1 + %cst:sgpr_64(s64) = G_CONSTANT i64 50175 + %mask:sgpr_64(s64) = G_CONSTANT i64 15360 + %masked_input:sgpr_64(s64) = G_AND %input, %mask + %merged:sgpr_64(s64) = G_OR %masked_input, %cst + %bfe:sgpr_64(s64) = S_BFE_I64 %merged, 524296, implicit-def $scc + $sgpr0_sgpr1 = COPY %bfe +... +--- +name: test_s_bfe_i32_g_constants +body: | + bb.0: + ; Extract [12:16) + ; CHECK-LABEL: name: @test_s_bfe_i32_g_constants + ; CHECK-NEXT: %src0:sgpr_32 KnownBits:00000000000000001111111111111111 SignBits:16 + ; CHECK-NEXT: %src1:sgpr_32 KnownBits:00000000000001000000000000001100 SignBits:13 + ; CHECK-NEXT: %bfe:sgpr_32 KnownBits:11111111111111111111111111111111 SignBits:32 + %src0:sgpr_32(s32) = G_CONSTANT i32 65535 + %src1:sgpr_32(s32) = G_CONSTANT i32 262156 + %bfe:sgpr_32(s32) = S_BFE_I32 %src0, %src1, implicit-def $scc + $sgpr0 = COPY %bfe +... +--- +name: test_s_bfe_u64_g_constants +body: | + bb.0: + ; Extract [12:16) + ; CHECK-LABEL: name: @test_s_bfe_u64_g_constants + ; CHECK-NEXT: %src0:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000001111111111111111 SignBits:48 + ; CHECK-NEXT: %src1:sgpr_32 KnownBits:00000000000001000000000000001100 SignBits:13 + ; CHECK-NEXT: %bfe:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000000000000000001111 SignBits:60 + %src0:sgpr_64(s64) = G_CONSTANT i64 65535 + %src1:sgpr_32(s32) = G_CONSTANT i32 262156 + %bfe:sgpr_64(s64) = S_BFE_U64 %src0, %src1, implicit-def $scc + $sgpr0_sgpr1 = COPY %bfe +... +--- +name: test_s_bfe_i32_g_constants_lookthrough +body: | + bb.0: + ; Extract [12:16) + ; CHECK-LABEL: name: @test_s_bfe_i32_g_constants_lookthrough + ; CHECK-NEXT: %src0:sgpr_32 KnownBits:00000000000000001111111111111111 SignBits:16 + ; CHECK-NEXT: %src1:sgpr_32 KnownBits:000001000000000000001100 SignBits:5 + ; CHECK-NEXT: %src1_ext:sgpr_32 KnownBits:00000000000001000000000000001100 SignBits:13 + ; CHECK-NEXT: %bfe:sgpr_32 KnownBits:11111111111111111111111111111111 SignBits:32 + %src0:sgpr_32(s32) = G_CONSTANT i32 65535 + %src1:sgpr_32(s24) = G_CONSTANT i24 262156 + %src1_ext:sgpr_32(s32) = G_ZEXT %src1 + %bfe:sgpr_32(s32) = S_BFE_I32 %src0, %src1_ext, implicit-def $scc + $sgpr0 = COPY %bfe +... 
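The "lookthrough" tests check that the analysis still finds the offset/width constant when src1 arrives through a G_ZEXT rather than as an inline operand. A toy model of that walk (illustrative only, not the LLVM API):

    # Values are ("const", n) or ("zext", inner); the analysis peels zexts
    # until it reaches the constant, as the %src1_ext chain above does.
    def look_through_zext(value):
        while value[0] == "zext":
            value = value[1]
        return value[1] if value[0] == "const" else None

    src1 = ("zext", ("const", 262156))  # mirrors %src1_ext = G_ZEXT %src1
    assert look_through_zext(src1) == 262156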
+--- +name: test_s_bfe_u64_g_constants_lookthrough +body: | + bb.0: + ; Extract [12:16) + ; CHECK-LABEL: name: @test_s_bfe_u64_g_constants_lookthrough + ; CHECK-NEXT: %src0:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000001111111111111111 SignBits:48 + ; CHECK-NEXT: %src1:sgpr_32 KnownBits:000001000000000000001100 SignBits:5 + ; CHECK-NEXT: %src1_ext:sgpr_32 KnownBits:00000000000001000000000000001100 SignBits:13 + ; CHECK-NEXT: %bfe:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000000000000000001111 SignBits:60 + %src0:sgpr_64(s64) = G_CONSTANT i64 65535 + %src1:sgpr_32(s24) = G_CONSTANT i24 262156 + %src1_ext:sgpr_32(s32) = G_ZEXT %src1 + %bfe:sgpr_64(s64) = S_BFE_U64 %src0, %src1_ext, implicit-def $scc + $sgpr0_sgpr1 = COPY %bfe +... +--- +name: test_s_bfe_u32_trash_bits +body: | + bb.0: + ; Extract [12:16) + ; Check that the 6th bit is ignored for u32. The lower 6 bits are + ; 101100 but we should mask out the leading 1 for the 32-bit version. + ; CHECK-LABEL: name: @test_s_bfe_u32_trash_bits + ; CHECK-NEXT: %cst:sgpr_32 KnownBits:00000000000000001111111111111111 SignBits:16 + ; CHECK-NEXT: %bfe:sgpr_32 KnownBits:00000000000000000000000000001111 SignBits:28 + %cst:sgpr_32(s32) = G_CONSTANT i32 65535 + %bfe:sgpr_32(s32) = S_BFE_U32 %cst, 262252, implicit-def $scc + $sgpr0 = COPY %bfe +... +--- +name: test_s_bfe_i32_trash_bits +body: | + bb.0: + ; Extract [12:16) + ; Check that the 6th bit is ignored for i32. The lower 6 bits are + ; 101100 but we should mask out the leading 1 for the 32-bit version. + ; CHECK-LABEL: name: @test_s_bfe_i32_trash_bits + ; CHECK-NEXT: %cst:sgpr_32 KnownBits:00000000000000001111111111111111 SignBits:16 + ; CHECK-NEXT: %bfe:sgpr_32 KnownBits:11111111111111111111111111111111 SignBits:32 + %cst:sgpr_32(s32) = G_CONSTANT i32 65535 + %bfe:sgpr_32(s32) = S_BFE_I32 %cst, 262252, implicit-def $scc + $sgpr0 = COPY %bfe +... +--- +name: test_s_bfe_u64_constants_sixth_bit +body: | + bb.0: + ; Extract [32:48) + ; Check that we correctly read 6 bits for the offset on 64-bit BFEs. + ; CHECK-LABEL: name: @test_s_bfe_u64_constants_sixth_bit + ; CHECK-NEXT: %cst:sgpr_64 KnownBits:0000000000000000111111111111111100000000000000000000000000000000 SignBits:16 + ; CHECK-NEXT: %bfe:sgpr_64 KnownBits:0000000000000000000000000000000000000000000000001111111111111111 SignBits:48 + %cst:sgpr_64(s64) = G_CONSTANT i64 281470681743360 + %bfe:sgpr_64(s64) = S_BFE_U64 %cst, 1048608, implicit-def $scc + $sgpr0_sgpr1 = COPY %bfe +... +--- +name: test_s_bfe_i64_constants_sixth_bit +body: | + bb.0: + ; Extract [32:48) + ; Check that we correctly read 6 bits for the offset on 64-bit BFEs. + ; CHECK-LABEL: name: @test_s_bfe_i64_constants_sixth_bit + ; CHECK-NEXT: %cst:sgpr_64 KnownBits:0000000000000000111111111111111100000000000000000000000000000000 SignBits:16 + ; CHECK-NEXT: %bfe:sgpr_64 KnownBits:1111111111111111111111111111111111111111111111111111111111111111 SignBits:64 + %cst:sgpr_64(s64) = G_CONSTANT i64 281470681743360 + %bfe:sgpr_64(s64) = S_BFE_I64 %cst, 1048608, implicit-def $scc + $sgpr0_sgpr1 = COPY %bfe +...
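The trash-bits and sixth-bit tests pin down how many offset bits each opcode reads. Assuming the 5-bit/6-bit split sketched earlier, the constants work out as follows:

    raw = 262252              # (4 << 16) | 0b1101100
    assert raw & 0x1F == 12   # 32-bit forms drop the leading 1: extract [12:16)
    assert raw & 0x3F == 44   # a 64-bit form would read offset 44 instead

    sixth = 1048608           # (16 << 16) | 32
    assert sixth & 0x3F == 32            # offset 32 needs the sixth bit
    assert (sixth >> 16) & 0x7F == 16    # width 16: extract [32:48)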
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll index 12b37c386c140..afabc7b62386f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) { ; GCN-LABEL: s_orn2_i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 530f4cf53321e..1eb8457cd4a5d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -254,27 +254,13 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) { ; CHECK-LABEL: v_srem_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x80000001 +; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 11, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 12, v1 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 4096 @@ -327,42 +313,21 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-LABEL: v_srem_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v7 +; CGP-NEXT: v_mov_b32_e32 v2, 0x80000001 +; CGP-NEXT: v_mul_hi_i32 v3, v0, v2 +; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 11, v3 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 11, v2 +; CGP-NEXT: v_lshrrev_b32_e32 v4, 31, v3 +; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xfffff000, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_lshlrev_b32_e32 v2, 12, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, <i32 4096, i32 4096> ret <2 x i32> %result @@ -372,27 +337,14 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) { ; CHECK-LABEL: v_srem_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0xd9528441 +; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 20, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2,
31, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 1235195 @@ -445,42 +397,22 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-LABEL: v_srem_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_mov_b32_e32 v2, 0xd9528441 +; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb +; CGP-NEXT: v_mul_hi_i32 v4, v0, v2 +; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 20, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 20, v2 +; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v4 +; CGP-NEXT: v_lshrrev_b32_e32 v6, 31, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, <i32 1235195, i32 1235195> ret <2 x i32> %result diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir index 2bd1b8bf3f3f6..d22a4b978980f 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -45,6 +45,9 @@ define amdgpu_kernel void @copy_agpr_to_agpr_tuple() #0 { ret void } define amdgpu_kernel void @copy_agpr_to_agpr_tuple_kill() #0 { ret void } + define amdgpu_kernel void @look_for_vgpr_killed() #0 { ret void } + define amdgpu_kernel void @look_for_vgpr_killed_tuple() #0 { ret void } + attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ... @@ -1517,3 +1520,83 @@ body: | renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ...
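Stepping back to the srem.i32.ll hunks above: the new CHECK lines encode a signed magic-multiply expansion in place of the old reciprocal-based sequence. Below is a sketch of what the updated v_srem_i32_pow2k_denom assembly computes, with the magic constant 0x80000001 and shift 11 read off the CHECK lines; the helper names are mine, and Python stands in for the 32-bit ALU:

    def s32(x: int) -> int:
        # reinterpret the low 32 bits as a signed value
        x &= 0xFFFFFFFF
        return x - (1 << 32) if x & 0x80000000 else x

    def srem_4096(x: int) -> int:
        xs = s32(x)
        hi = s32((xs * s32(0x80000001)) >> 32)  # v_mul_hi_i32
        t = s32(hi + xs)                        # v_add_i32_e32
        q = t >> 11                             # v_ashrrev_i32_e32
        q += (q >> 31) & 1                      # v_lshrrev_b32 + v_add: round toward zero
        return s32(xs - s32(q << 12))           # v_lshlrev_b32 + v_sub

    def trunc_rem(a: int, b: int) -> int:       # C-style srem, for reference
        return a - (abs(a) // b) * b * (1 if a >= 0 else -1)

    for x in (0, 1, 4095, 4096, 5000, -1, -4096, -5000, 2**31 - 1, -2**31):
        assert srem_4096(x) == trunc_rem(s32(x), 4096)

The oddk variant has the same shape, with magic 0xd9528441, shift 20, and a v_mul_lo_u32 by 1235195 in place of the shift back up.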
+ +# Make sure the expansion of the a-to-a copy doesn't introduce a use +# after kill of the source vgpr +--- +name: look_for_vgpr_killed +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0 + + ; GFX908-LABEL: name: look_for_vgpr_killed + ; GFX908: liveins: $agpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: S_NOP 0, implicit $vgpr0 + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; + ; GFX90A-LABEL: name: look_for_vgpr_killed + ; GFX90A: liveins: $agpr0 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A-NEXT: S_NOP 0, implicit killed $vgpr0 + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec + ; + ; GFX942-LABEL: name: look_for_vgpr_killed + ; GFX942: liveins: $agpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX942-NEXT: S_NOP 0, implicit killed $vgpr0 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $agpr0 = COPY $vgpr0 + S_NOP 0, implicit killed $vgpr0 + $agpr1 = COPY $agpr0 + +... + +--- +name: look_for_vgpr_killed_tuple +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0 + + ; GFX908-LABEL: name: look_for_vgpr_killed_tuple + ; GFX908: liveins: $agpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: S_NOP 0, implicit $vgpr0_vgpr1 + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; + ; GFX90A-LABEL: name: look_for_vgpr_killed_tuple + ; GFX90A: liveins: $agpr0 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A-NEXT: S_NOP 0, implicit killed $vgpr0_vgpr1 + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec + ; + ; GFX942-LABEL: name: look_for_vgpr_killed_tuple + ; GFX942: liveins: $agpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX942-NEXT: S_NOP 0, implicit killed $vgpr0_vgpr1 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $agpr0 = COPY $vgpr0 + S_NOP 0, implicit killed $vgpr0_vgpr1 + $agpr1 = COPY $agpr0 + +... 
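The two new accvgpr-copy.mir tests guard the copy expander's vgpr-reuse path: GFX908 has no agpr-to-agpr move (unlike GFX90A/GFX942, which select V_ACCVGPR_MOV_B32 above), so the second COPY is expanded to read $vgpr0 again, and that is only sound if the kill flag on the intervening S_NOP use is cleared; hence the GFX908 output saying `implicit $vgpr0` where the input said `implicit killed $vgpr0`. A toy model of that bookkeeping, not the LLVM API:

    # Each toy instruction records which registers it defines, uses, and kills.
    def reuse_and_clear_kills(insts, reg, reuse_idx):
        for inst in insts[:reuse_idx]:
            inst["kills"].discard(reg)    # the value now lives past these points
        insts[reuse_idx]["uses"].add(reg)

    prog = [
        {"defs": {"v0"}, "uses": set(), "kills": set()},   # $vgpr0 = V_MOV_B32 0
        {"defs": {"a0"}, "uses": {"v0"}, "kills": set()},  # $agpr0 = COPY $vgpr0
        {"defs": set(), "uses": {"v0"}, "kills": {"v0"}},  # S_NOP implicit killed $vgpr0
        {"defs": {"a1"}, "uses": set(), "kills": set()},   # $agpr1 = COPY $agpr0
    ]
    reuse_and_clear_kills(prog, "v0", 3)   # expand the a-to-a copy through v0
    assert "v0" not in prog[2]["kills"]    # matches the GFX908 CHECK lines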
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll index 6742ae6c1d584..f6465de86fa4f 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll @@ -6,17 +6,17 @@ define amdgpu_kernel void @remat_constant_voids_spill(ptr addrspace(1) %p) #1 { ; GFX908-LABEL: remat_constant_voids_spill: ; GFX908: ; %bb.0: -; GFX908-NEXT: v_accvgpr_write_b32 a1, 1 -; GFX908-NEXT: v_accvgpr_write_b32 a5, 6 -; GFX908-NEXT: v_accvgpr_write_b32 a6, 7 -; GFX908-NEXT: v_accvgpr_write_b32 a7, 8 -; GFX908-NEXT: v_accvgpr_write_b32 a0, 9 -; GFX908-NEXT: v_accvgpr_write_b32 a2, 2 -; GFX908-NEXT: v_accvgpr_write_b32 a3, 3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, 4 +; GFX908-NEXT: v_accvgpr_write_b32 a0, 1 +; GFX908-NEXT: v_accvgpr_write_b32 a1, 2 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 3 +; GFX908-NEXT: v_accvgpr_write_b32 a3, 4 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_write_b32 a1, 5 +; GFX908-NEXT: v_accvgpr_write_b32 a0, 5 +; GFX908-NEXT: v_accvgpr_write_b32 a1, 6 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 7 +; GFX908-NEXT: v_accvgpr_write_b32 a3, 8 +; GFX908-NEXT: v_accvgpr_write_b32 a4, 9 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll deleted file mode 100644 index 8945708e0f0ca..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll +++ /dev/null @@ -1,2853 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -amdgpu-codegenprepare-widen-16-bit-ops -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s -; RUN: opt -S -amdgpu-codegenprepare-widen-16-bit-ops -mtriple=amdgcn-- -mcpu=tonga -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s - -define amdgpu_kernel void @add_i3(i3 %a, i3 %b) { -; SI-LABEL: @add_i3( -; SI-NEXT: [[R:%.*]] = add i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @add_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = add i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) { -; SI-LABEL: @add_nsw_i3( -; SI-NEXT: [[R:%.*]] = add nsw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @add_nsw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = add nsw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) { -; SI-LABEL: @add_nuw_i3( -; SI-NEXT: [[R:%.*]] = add nuw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @add_nuw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] 
= zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = add nuw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) { -; SI-LABEL: @add_nuw_nsw_i3( -; SI-NEXT: [[R:%.*]] = add nuw nsw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @add_nuw_nsw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = add nuw nsw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) { -; SI-LABEL: @sub_i3( -; SI-NEXT: [[R:%.*]] = sub i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @sub_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = sub i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) { -; SI-LABEL: @sub_nsw_i3( -; SI-NEXT: [[R:%.*]] = sub nsw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @sub_nsw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = sub nsw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) { -; SI-LABEL: @sub_nuw_i3( -; SI-NEXT: [[R:%.*]] = sub nuw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @sub_nuw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = sub nuw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) { -; SI-LABEL: @sub_nuw_nsw_i3( -; SI-NEXT: [[R:%.*]] = sub nuw nsw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @sub_nuw_nsw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - 
%r = sub nuw nsw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) { -; SI-LABEL: @mul_i3( -; SI-NEXT: [[R:%.*]] = mul i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @mul_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = mul i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) { -; SI-LABEL: @mul_nsw_i3( -; SI-NEXT: [[R:%.*]] = mul nsw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @mul_nsw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = mul nsw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) { -; SI-LABEL: @mul_nuw_i3( -; SI-NEXT: [[R:%.*]] = mul nuw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @mul_nuw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = mul nuw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) { -; SI-LABEL: @mul_nuw_nsw_i3( -; SI-NEXT: [[R:%.*]] = mul nuw nsw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @mul_nuw_nsw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = mul nuw nsw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) { -; SI-LABEL: @shl_i3( -; SI-NEXT: [[R:%.*]] = shl i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @shl_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = shl i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) { -; SI-LABEL: @shl_nsw_i3( -; SI-NEXT: [[R:%.*]] = shl nsw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr 
addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @shl_nsw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = shl nsw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) { -; SI-LABEL: @shl_nuw_i3( -; SI-NEXT: [[R:%.*]] = shl nuw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @shl_nuw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = shl nuw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @shl_nuw_nsw_i3(i3 %a, i3 %b) { -; SI-LABEL: @shl_nuw_nsw_i3( -; SI-NEXT: [[R:%.*]] = shl nuw nsw i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @shl_nuw_nsw_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = shl nuw nsw i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) { -; SI-LABEL: @lshr_i3( -; SI-NEXT: [[R:%.*]] = lshr i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @lshr_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = lshr i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) { -; SI-LABEL: @lshr_exact_i3( -; SI-NEXT: [[R:%.*]] = lshr exact i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @lshr_exact_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = lshr exact i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = lshr exact i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) { -; SI-LABEL: @ashr_i3( -; SI-NEXT: [[R:%.*]] = ashr i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @ashr_i3( -; VI-NEXT: [[TMP1:%.*]] = sext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], [[TMP2]] -; VI-NEXT: 
[[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = ashr i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) { -; SI-LABEL: @ashr_exact_i3( -; SI-NEXT: [[R:%.*]] = ashr exact i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @ashr_exact_i3( -; VI-NEXT: [[TMP1:%.*]] = sext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = ashr exact i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = ashr exact i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @and_i3(i3 %a, i3 %b) { -; SI-LABEL: @and_i3( -; SI-NEXT: [[R:%.*]] = and i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @and_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = and i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = and i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @or_i3(i3 %a, i3 %b) { -; SI-LABEL: @or_i3( -; SI-NEXT: [[R:%.*]] = or i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @or_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = or i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = or i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @xor_i3(i3 %a, i3 %b) { -; SI-LABEL: @xor_i3( -; SI-NEXT: [[R:%.*]] = xor i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile i3 [[R]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @xor_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; VI-NEXT: store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %r = xor i3 %a, %b - store volatile i3 %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_eq_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_eq_i3( -; SI-NEXT: [[CMP:%.*]] = icmp eq i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @select_eq_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext i3 [[A]] to i32 -; VI-NEXT: [[TMP5:%.*]] = zext i3 [[B]] to i32 -; VI-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 -; VI-NEXT: store volatile i3 [[TMP7]], ptr 
addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %cmp = icmp eq i3 %a, %b - %sel = select i1 %cmp, i3 %a, i3 %b - store volatile i3 %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_ne_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_ne_i3( -; SI-NEXT: [[CMP:%.*]] = icmp ne i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @select_ne_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext i3 [[A]] to i32 -; VI-NEXT: [[TMP5:%.*]] = zext i3 [[B]] to i32 -; VI-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 -; VI-NEXT: store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %cmp = icmp ne i3 %a, %b - %sel = select i1 %cmp, i3 %a, i3 %b - store volatile i3 %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_ugt_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_ugt_i3( -; SI-NEXT: [[CMP:%.*]] = icmp ugt i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @select_ugt_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext i3 [[A]] to i32 -; VI-NEXT: [[TMP5:%.*]] = zext i3 [[B]] to i32 -; VI-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 -; VI-NEXT: store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %cmp = icmp ugt i3 %a, %b - %sel = select i1 %cmp, i3 %a, i3 %b - store volatile i3 %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_uge_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_uge_i3( -; SI-NEXT: [[CMP:%.*]] = icmp uge i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @select_uge_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = icmp uge i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext i3 [[A]] to i32 -; VI-NEXT: [[TMP5:%.*]] = zext i3 [[B]] to i32 -; VI-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 -; VI-NEXT: store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %cmp = icmp uge i3 %a, %b - %sel = select i1 %cmp, i3 %a, i3 %b - store volatile i3 %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_ult_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_ult_i3( -; SI-NEXT: [[CMP:%.*]] = icmp ult i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @select_ult_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext 
i3 [[A]] to i32 -; VI-NEXT: [[TMP5:%.*]] = zext i3 [[B]] to i32 -; VI-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 -; VI-NEXT: store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %cmp = icmp ult i3 %a, %b - %sel = select i1 %cmp, i3 %a, i3 %b - store volatile i3 %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_ule_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_ule_i3( -; SI-NEXT: [[CMP:%.*]] = icmp ule i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @select_ule_i3( -; VI-NEXT: [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = zext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext i3 [[A]] to i32 -; VI-NEXT: [[TMP5:%.*]] = zext i3 [[B]] to i32 -; VI-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 -; VI-NEXT: store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %cmp = icmp ule i3 %a, %b - %sel = select i1 %cmp, i3 %a, i3 %b - store volatile i3 %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_sgt_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_sgt_i3( -; SI-NEXT: [[CMP:%.*]] = icmp sgt i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @select_sgt_i3( -; VI-NEXT: [[TMP1:%.*]] = sext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = sext i3 [[A]] to i32 -; VI-NEXT: [[TMP5:%.*]] = sext i3 [[B]] to i32 -; VI-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 -; VI-NEXT: store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %cmp = icmp sgt i3 %a, %b - %sel = select i1 %cmp, i3 %a, i3 %b - store volatile i3 %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_sge_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_sge_i3( -; SI-NEXT: [[CMP:%.*]] = icmp sge i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; SI-NEXT: ret void -; -; VI-LABEL: @select_sge_i3( -; VI-NEXT: [[TMP1:%.*]] = sext i3 [[A:%.*]] to i32 -; VI-NEXT: [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32 -; VI-NEXT: [[TMP3:%.*]] = icmp sge i32 [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = sext i3 [[A]] to i32 -; VI-NEXT: [[TMP5:%.*]] = sext i3 [[B]] to i32 -; VI-NEXT: [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3 -; VI-NEXT: store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1 -; VI-NEXT: ret void -; - %cmp = icmp sge i3 %a, %b - %sel = select i1 %cmp, i3 %a, i3 %b - store volatile i3 %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_slt_i3(i3 %a, i3 %b) { -; SI-LABEL: @select_slt_i3( -; SI-NEXT: [[CMP:%.*]] = icmp slt i3 [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]] -; SI-NEXT: store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1 -; 
SI-NEXT:    ret void
-;
-; VI-LABEL: @select_slt_i3(
-; VI-NEXT:    [[TMP1:%.*]] = sext i3 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext i3 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1
-; VI-NEXT:    ret void
-;
-  %cmp = icmp slt i3 %a, %b
-  %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_sle_i3(i3 %a, i3 %b) {
-; SI-LABEL: @select_sle_i3(
-; SI-NEXT:    [[CMP:%.*]] = icmp sle i3 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i3 [[A]], i3 [[B]]
-; SI-NEXT:    store volatile i3 [[SEL]], ptr addrspace(1) poison, align 1
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_sle_i3(
-; VI-NEXT:    [[TMP1:%.*]] = sext i3 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = sext i3 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp sle i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext i3 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = sext i3 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
-; VI-NEXT:    store volatile i3 [[TMP7]], ptr addrspace(1) poison, align 1
-; VI-NEXT:    ret void
-;
-  %cmp = icmp sle i3 %a, %b
-  %sel = select i1 %cmp, i3 %a, i3 %b
-  store volatile i3 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-declare i3 @llvm.bitreverse.i3(i3)
-define amdgpu_kernel void @bitreverse_i3(i3 %a) {
-; SI-LABEL: @bitreverse_i3(
-; SI-NEXT:    [[BREV:%.*]] = call i3 @llvm.bitreverse.i3(i3 [[A:%.*]])
-; SI-NEXT:    store volatile i3 [[BREV]], ptr addrspace(1) poison, align 1
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @bitreverse_i3(
-; VI-NEXT:    [[TMP1:%.*]] = zext i3 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[TMP1]])
-; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 29
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
-; VI-NEXT:    store volatile i3 [[TMP4]], ptr addrspace(1) poison, align 1
-; VI-NEXT:    ret void
-;
-  %brev = call i3 @llvm.bitreverse.i3(i3 %a)
-  store volatile i3 %brev, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_i16(i16 %a, i16 %b) {
-; SI-LABEL: @add_i16(
-; SI-NEXT:    [[R:%.*]] = add i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = add i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @constant_add_i16() {
-; SI-LABEL: @constant_add_i16(
-; SI-NEXT:    [[R:%.*]] = add i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @constant_add_i16(
-; VI-NEXT:    store volatile i16 3, ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = add i16 1, 2
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @constant_add_nsw_i16() {
-; SI-LABEL: @constant_add_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = add nsw i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @constant_add_nsw_i16(
-; VI-NEXT:    store volatile i16 3, ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = add nsw i16 1, 2
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @constant_add_nuw_i16() {
-; SI-LABEL: @constant_add_nuw_i16(
-; SI-NEXT:    [[R:%.*]] = add nsw i16 1, 2
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @constant_add_nuw_i16(
-; VI-NEXT:    store volatile i16 3, ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = add nsw i16 1, 2
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @add_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = add nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nsw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = add nsw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @add_nuw_i16(
-; SI-NEXT:    [[R:%.*]] = add nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nuw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = add nuw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @add_nuw_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = add nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nuw_nsw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = add nuw nsw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) {
-; SI-LABEL: @sub_i16(
-; SI-NEXT:    [[R:%.*]] = sub i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = sub i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @sub_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = sub nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nsw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = sub nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = sub nsw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @sub_nuw_i16(
-; SI-NEXT:    [[R:%.*]] = sub nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nuw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = sub nuw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @sub_nuw_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = sub nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nuw_nsw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = sub nuw nsw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) {
-; SI-LABEL: @mul_i16(
-; SI-NEXT:    [[R:%.*]] = mul i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = mul i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @mul_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = mul nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nsw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = mul nsw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @mul_nuw_i16(
-; SI-NEXT:    [[R:%.*]] = mul nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nuw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = mul nuw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @mul_nuw_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = mul nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nuw_nsw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = mul nuw nsw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) {
-; SI-LABEL: @shl_i16(
-; SI-NEXT:    [[R:%.*]] = shl i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = shl i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @shl_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = shl nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nsw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = shl nsw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @shl_nuw_i16(
-; SI-NEXT:    [[R:%.*]] = shl nuw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nuw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = shl nuw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) {
-; SI-LABEL: @shl_nuw_nsw_i16(
-; SI-NEXT:    [[R:%.*]] = shl nuw nsw i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nuw_nsw_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = shl nuw nsw i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) {
-; SI-LABEL: @lshr_i16(
-; SI-NEXT:    [[R:%.*]] = lshr i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @lshr_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = lshr i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) {
-; SI-LABEL: @lshr_exact_i16(
-; SI-NEXT:    [[R:%.*]] = lshr exact i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @lshr_exact_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = lshr exact i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = lshr exact i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) {
-; SI-LABEL: @ashr_i16(
-; SI-NEXT:    [[R:%.*]] = ashr i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @ashr_i16(
-; VI-NEXT:    [[TMP1:%.*]] = sext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = ashr i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) {
-; SI-LABEL: @ashr_exact_i16(
-; SI-NEXT:    [[R:%.*]] = ashr exact i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @ashr_exact_i16(
-; VI-NEXT:    [[TMP1:%.*]] = sext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = ashr exact i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = ashr exact i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @constant_lshr_exact_i16(i16 %a, i16 %b) {
-; SI-LABEL: @constant_lshr_exact_i16(
-; SI-NEXT:    [[R:%.*]] = lshr exact i16 4, 1
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @constant_lshr_exact_i16(
-; VI-NEXT:    store volatile i16 2, ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = lshr exact i16 4, 1
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @and_i16(i16 %a, i16 %b) {
-; SI-LABEL: @and_i16(
-; SI-NEXT:    [[R:%.*]] = and i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @and_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = and i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = and i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @or_i16(i16 %a, i16 %b) {
-; SI-LABEL: @or_i16(
-; SI-NEXT:    [[R:%.*]] = or i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @or_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = or i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = or i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @xor_i16(i16 %a, i16 %b) {
-; SI-LABEL: @xor_i16(
-; SI-NEXT:    [[R:%.*]] = xor i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile i16 [[R]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @xor_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %r = xor i16 %a, %b
-  store volatile i16 %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_eq_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_eq_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_eq_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp eq i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_ne_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_ne_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp ne i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_ne_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp ne i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_ugt_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_ugt_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp ugt i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_ugt_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp ugt i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp ugt i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_uge_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_uge_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp uge i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_uge_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp uge i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp uge i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_ult_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_ult_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp ult i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_ult_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp ult i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_ule_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_ule_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp ule i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_ule_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = zext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = zext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp ule i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_sgt_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_sgt_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp sgt i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_sgt_i16(
-; VI-NEXT:    [[TMP1:%.*]] = sext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp sgt i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_sge_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_sge_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp sge i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_sge_i16(
-; VI-NEXT:    [[TMP1:%.*]] = sext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp sge i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp sge i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_slt_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_slt_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_slt_i16(
-; VI-NEXT:    [[TMP1:%.*]] = sext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp slt i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_sle_i16(i16 %a, i16 %b) {
-; SI-LABEL: @select_sle_i16(
-; SI-NEXT:    [[CMP:%.*]] = icmp sle i16 [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i16 [[A]], i16 [[B]]
-; SI-NEXT:    store volatile i16 [[SEL]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_sle_i16(
-; VI-NEXT:    [[TMP1:%.*]] = sext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = sext i16 [[B:%.*]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = icmp sle i32 [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext i16 [[A]] to i32
-; VI-NEXT:    [[TMP5:%.*]] = sext i16 [[B]] to i32
-; VI-NEXT:    [[TMP6:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP6]] to i16
-; VI-NEXT:    store volatile i16 [[TMP7]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %cmp = icmp sle i16 %a, %b
-  %sel = select i1 %cmp, i16 %a, i16 %b
-  store volatile i16 %sel, ptr addrspace(1) poison
-  ret void
-}
-
-declare i16 @llvm.bitreverse.i16(i16)
-
-define amdgpu_kernel void @bitreverse_i16(i16 %a) {
-; SI-LABEL: @bitreverse_i16(
-; SI-NEXT:    [[BREV:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[A:%.*]])
-; SI-NEXT:    store volatile i16 [[BREV]], ptr addrspace(1) poison, align 2
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @bitreverse_i16(
-; VI-NEXT:    [[TMP1:%.*]] = zext i16 [[A:%.*]] to i32
-; VI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[TMP1]])
-; VI-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 16
-; VI-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
-; VI-NEXT:    store volatile i16 [[TMP4]], ptr addrspace(1) poison, align 2
-; VI-NEXT:    ret void
-;
-  %brev = call i16 @llvm.bitreverse.i16(i16 %a)
-  store volatile i16 %brev, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @add_3xi15(
-; SI-NEXT:    [[R:%.*]] = add <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = add <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @add_nsw_3xi15(
-; SI-NEXT:    [[R:%.*]] = add nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nsw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = add nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @add_nuw_3xi15(
-; SI-NEXT:    [[R:%.*]] = add nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nuw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = add nuw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @add_nuw_nsw_3xi15(
-; SI-NEXT:    [[R:%.*]] = add nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nuw_nsw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = add nuw nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @sub_3xi15(
-; SI-NEXT:    [[R:%.*]] = sub <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = sub <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @sub_nsw_3xi15(
-; SI-NEXT:    [[R:%.*]] = sub nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nsw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = sub nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @sub_nuw_3xi15(
-; SI-NEXT:    [[R:%.*]] = sub nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nuw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = sub nuw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @sub_nuw_nsw_3xi15(
-; SI-NEXT:    [[R:%.*]] = sub nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nuw_nsw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = sub nuw nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @mul_3xi15(
-; SI-NEXT:    [[R:%.*]] = mul <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = mul <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @mul_nsw_3xi15(
-; SI-NEXT:    [[R:%.*]] = mul nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nsw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = mul nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @mul_nuw_3xi15(
-; SI-NEXT:    [[R:%.*]] = mul nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nuw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = mul nuw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @mul_nuw_nsw_3xi15(
-; SI-NEXT:    [[R:%.*]] = mul nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nuw_nsw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = mul nuw nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @shl_3xi15(
-; SI-NEXT:    [[R:%.*]] = shl <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = shl <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @shl_nsw_3xi15(
-; SI-NEXT:    [[R:%.*]] = shl nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nsw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = shl nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @shl_nuw_3xi15(
-; SI-NEXT:    [[R:%.*]] = shl nuw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nuw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = shl nuw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @shl_nuw_nsw_3xi15(
-; SI-NEXT:    [[R:%.*]] = shl nuw nsw <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nuw_nsw_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = shl nuw nsw <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @lshr_3xi15(
-; SI-NEXT:    [[R:%.*]] = lshr <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @lshr_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = lshr <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @lshr_exact_3xi15(
-; SI-NEXT:    [[R:%.*]] = lshr exact <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @lshr_exact_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = lshr exact <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = lshr exact <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @ashr_3xi15(
-; SI-NEXT:    [[R:%.*]] = ashr <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @ashr_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = sext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = ashr <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = ashr <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @ashr_exact_3xi15(
-; SI-NEXT:    [[R:%.*]] = ashr exact <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @ashr_exact_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = sext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = ashr exact <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = ashr exact <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @and_3xi15(
-; SI-NEXT:    [[R:%.*]] = and <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @and_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = and <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = and <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @or_3xi15(
-; SI-NEXT:    [[R:%.*]] = or <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @or_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = or <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = or <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @xor_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @xor_3xi15(
-; SI-NEXT:    [[R:%.*]] = xor <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i15> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @xor_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = xor <3 x i15> %a, %b
-  store volatile <3 x i15> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_eq_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp eq <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_eq_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp eq <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp eq <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_ne_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp ne <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_ne_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp ne <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_ugt_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp ugt <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_ugt_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp ugt <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp ugt <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_uge_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp uge <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_uge_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp uge <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp uge <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_ult_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp ult <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_ult_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp ult <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp ult <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_ule_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp ule <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_ule_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp ule <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = zext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = zext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp ule <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_sgt_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp sgt <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_sgt_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = sext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp sgt <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp sgt <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_sge_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp sge <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_sge_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = sext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp sge <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp sge <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_slt_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp slt <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_slt_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = sext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp slt <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp slt <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) {
-; SI-LABEL: @select_sle_3xi15(
-; SI-NEXT:    [[CMP:%.*]] = icmp sle <3 x i15> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i15> [[A]], <3 x i15> [[B]]
-; SI-NEXT:    store volatile <3 x i15> [[SEL]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @select_sle_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = sext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i15> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = icmp sle <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = sext <3 x i15> [[A]] to <3 x i32>
-; VI-NEXT:    [[TMP5:%.*]] = sext <3 x i15> [[B]] to <3 x i32>
-; VI-NEXT:    [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]]
-; VI-NEXT:    [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP7]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %cmp = icmp sle <3 x i15> %a, %b
-  %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  store volatile <3 x i15> %sel, ptr addrspace(1) poison
-  ret void
-}
-
-declare <3 x i15> @llvm.bitreverse.v3i15(<3 x i15>)
-define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) {
-; SI-LABEL: @bitreverse_3xi15(
-; SI-NEXT:    [[BREV:%.*]] = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> [[A:%.*]])
-; SI-NEXT:    store volatile <3 x i15> [[BREV]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @bitreverse_3xi15(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i15> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> [[TMP1]])
-; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP2]], splat (i32 17)
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i15>
-; VI-NEXT:    store volatile <3 x i15> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a)
-  store volatile <3 x i15> %brev, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @add_3xi16(
-; SI-NEXT:    [[R:%.*]] = add <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = add <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @add_nsw_3xi16(
-; SI-NEXT:    [[R:%.*]] = add nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nsw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = add nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @add_nuw_3xi16(
-; SI-NEXT:    [[R:%.*]] = add nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nuw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = add nuw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @add_nuw_nsw_3xi16(
-; SI-NEXT:    [[R:%.*]] = add nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @add_nuw_nsw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = add nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = add nuw nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @sub_3xi16(
-; SI-NEXT:    [[R:%.*]] = sub <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = sub <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @sub_nsw_3xi16(
-; SI-NEXT:    [[R:%.*]] = sub nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nsw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = sub nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = sub nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @sub_nuw_3xi16(
-; SI-NEXT:    [[R:%.*]] = sub nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nuw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = sub nuw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @sub_nuw_nsw_3xi16(
-; SI-NEXT:    [[R:%.*]] = sub nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @sub_nuw_nsw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = sub nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = sub nuw nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @mul_3xi16(
-; SI-NEXT:    [[R:%.*]] = mul <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = mul <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @mul_nsw_3xi16(
-; SI-NEXT:    [[R:%.*]] = mul nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nsw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = mul nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @mul_nuw_3xi16(
-; SI-NEXT:    [[R:%.*]] = mul nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nuw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = mul nuw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @mul_nuw_nsw_3xi16(
-; SI-NEXT:    [[R:%.*]] = mul nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @mul_nuw_nsw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = mul nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = mul nuw nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @shl_3xi16(
-; SI-NEXT:    [[R:%.*]] = shl <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = shl <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @shl_nsw_3xi16(
-; SI-NEXT:    [[R:%.*]] = shl nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nsw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = shl nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @shl_nuw_3xi16(
-; SI-NEXT:    [[R:%.*]] = shl nuw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nuw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = shl nuw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @shl_nuw_nsw_3xi16(
-; SI-NEXT:    [[R:%.*]] = shl nuw nsw <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @shl_nuw_nsw_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = shl nuw nsw <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = shl nuw nsw <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @lshr_3xi16(
-; SI-NEXT:    [[R:%.*]] = lshr <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @lshr_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = lshr <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = lshr <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @lshr_exact_3xi16(
-; SI-NEXT:    [[R:%.*]] = lshr exact <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @lshr_exact_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = lshr exact <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = lshr exact <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @ashr_3xi16(
-; SI-NEXT:    [[R:%.*]] = ashr <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @ashr_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = sext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = ashr <3 x i32> [[TMP1]], [[TMP2]]
-; VI-NEXT:    [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16>
-; VI-NEXT:    store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8
-; VI-NEXT:    ret void
-;
-  %r = ashr <3 x i16> %a, %b
-  store volatile <3 x i16> %r, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
-; SI-LABEL: @ashr_exact_3xi16(
-; SI-NEXT:    [[R:%.*]] = ashr exact <3 x i16> [[A:%.*]], [[B:%.*]]
-; SI-NEXT:    store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8
-; SI-NEXT:    ret void
-;
-; VI-LABEL: @ashr_exact_3xi16(
-; VI-NEXT:    [[TMP1:%.*]] = sext <3 x i16> [[A:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32>
-; VI-NEXT:    [[TMP3:%.*]] = ashr
exact <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %r = ashr exact <3 x i16> %a, %b - store volatile <3 x i16> %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @and_3xi16( -; SI-NEXT: [[R:%.*]] = and <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @and_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = and <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %r = and <3 x i16> %a, %b - store volatile <3 x i16> %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @or_3xi16( -; SI-NEXT: [[R:%.*]] = or <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @or_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = or <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %r = or <3 x i16> %a, %b - store volatile <3 x i16> %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @xor_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @xor_3xi16( -; SI-NEXT: [[R:%.*]] = xor <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: store volatile <3 x i16> [[R]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @xor_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %r = xor <3 x i16> %a, %b - store volatile <3 x i16> %r, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_eq_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp eq <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_eq_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp eq <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp eq <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> 
%a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_ne_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp ne <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_ne_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp ne <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_ugt_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp ugt <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_ugt_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp ugt <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp ugt <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_uge_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp uge <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_uge_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp uge <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp uge <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_ult_3xi16(<3 x i16> %a, <3 
x i16> %b) { -; SI-LABEL: @select_ult_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp ult <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_ult_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp ult <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp ult <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_ule_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp ule <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_ule_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = zext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp ule <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = zext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = zext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp ule <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_sgt_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp sgt <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_sgt_3xi16( -; VI-NEXT: [[TMP1:%.*]] = sext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp sgt <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = sext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp sgt <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_sge_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp sge <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 
x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_sge_3xi16( -; VI-NEXT: [[TMP1:%.*]] = sext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp sge <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = sext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp sge <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_slt_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp slt <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_slt_3xi16( -; VI-NEXT: [[TMP1:%.*]] = sext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp slt <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = sext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp slt <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) { -; SI-LABEL: @select_sle_3xi16( -; SI-NEXT: [[CMP:%.*]] = icmp sle <3 x i16> [[A:%.*]], [[B:%.*]] -; SI-NEXT: [[SEL:%.*]] = select <3 x i1> [[CMP]], <3 x i16> [[A]], <3 x i16> [[B]] -; SI-NEXT: store volatile <3 x i16> [[SEL]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: @select_sle_3xi16( -; VI-NEXT: [[TMP1:%.*]] = sext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = sext <3 x i16> [[B:%.*]] to <3 x i32> -; VI-NEXT: [[TMP3:%.*]] = icmp sle <3 x i32> [[TMP1]], [[TMP2]] -; VI-NEXT: [[TMP4:%.*]] = sext <3 x i16> [[A]] to <3 x i32> -; VI-NEXT: [[TMP5:%.*]] = sext <3 x i16> [[B]] to <3 x i32> -; VI-NEXT: [[TMP6:%.*]] = select <3 x i1> [[TMP3]], <3 x i32> [[TMP4]], <3 x i32> [[TMP5]] -; VI-NEXT: [[TMP7:%.*]] = trunc <3 x i32> [[TMP6]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP7]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %cmp = icmp sle <3 x i16> %a, %b - %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - store volatile <3 x i16> %sel, ptr addrspace(1) poison - ret void -} - -declare <3 x i16> @llvm.bitreverse.v3i16(<3 x i16>) - -define amdgpu_kernel void @bitreverse_3xi16(<3 x i16> %a) { -; SI-LABEL: @bitreverse_3xi16( -; SI-NEXT: [[BREV:%.*]] = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> [[A:%.*]]) -; SI-NEXT: store volatile <3 x i16> [[BREV]], ptr addrspace(1) poison, align 8 -; SI-NEXT: ret void -; -; VI-LABEL: 
@bitreverse_3xi16( -; VI-NEXT: [[TMP1:%.*]] = zext <3 x i16> [[A:%.*]] to <3 x i32> -; VI-NEXT: [[TMP2:%.*]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> [[TMP1]]) -; VI-NEXT: [[TMP3:%.*]] = lshr <3 x i32> [[TMP2]], splat (i32 16) -; VI-NEXT: [[TMP4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i16> -; VI-NEXT: store volatile <3 x i16> [[TMP4]], ptr addrspace(1) poison, align 8 -; VI-NEXT: ret void -; - %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a) - store volatile <3 x i16> %brev, ptr addrspace(1) poison - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index b7097a9557b75..c7385e4324e2c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -7791,7 +7791,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7927,7 +7927,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 @@ -8982,7 +8982,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9116,7 +9116,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 @@ -10096,9 +10096,15 @@ define i64 @udiv_i64_9divbits(i8 %size) { } define <2 x i64> @srem_zero_zero() { -; GCN-LABEL: kernel: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm +; GFX6-LABEL: srem_zero_zero: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: srem_zero_zero: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] entry: %B = srem <2 x i64> zeroinitializer, zeroinitializer ret <2 x i64> %B diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir new file mode 100644 index 0000000000000..69bdb1f5066f0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir @@ -0,0 +1,95 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefixes=HAS-AGPR,GFX90A %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefixes=HAS-AGPR,GFX908 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx906 
-passes=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefix=NO-AGPR %s
+
+--- |
+  define void @func() {
+    ret void
+  }
+
+  ; Attribute is ignored for gfx90a
+  define void @no_agprs() "amdgpu-agpr-alloc"="0,0" {
+    ret void
+  }
+
+...
+---
+name: func
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 4 }
+body: |
+  ; HAS-AGPR-LABEL: name: func
+  ; HAS-AGPR: bb.0:
+  ; HAS-AGPR-NEXT: successors: %bb.1(0x80000000)
+  ; HAS-AGPR-NEXT: liveins: $vgpr0
+  ; HAS-AGPR-NEXT: {{ $}}
+  ; HAS-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; HAS-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
+  ; HAS-AGPR-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
+  ; HAS-AGPR-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65, implicit $exec
+  ; HAS-AGPR-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+  ; HAS-AGPR-NEXT: [[AV_MOV_1:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
+  ; HAS-AGPR-NEXT: [[AV_MOV_2:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 6, implicit $exec
+  ; HAS-AGPR-NEXT: {{ $}}
+  ; HAS-AGPR-NEXT: bb.1:
+  ; HAS-AGPR-NEXT: [[AV_MOV_3:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 3, implicit $exec
+  ;
+  ; NO-AGPR-LABEL: name: func
+  ; NO-AGPR: bb.0:
+  ; NO-AGPR-NEXT: successors: %bb.1(0x80000000)
+  ; NO-AGPR-NEXT: liveins: $vgpr0
+  ; NO-AGPR-NEXT: {{ $}}
+  ; NO-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+  ; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
+  ; NO-AGPR-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; NO-AGPR-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65, implicit $exec
+  ; NO-AGPR-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+  ; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
+  ; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 6, implicit $exec
+  ; NO-AGPR-NEXT: {{ $}}
+  ; NO-AGPR-NEXT: bb.1:
+  ; NO-AGPR-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+  bb.0:
+    liveins: $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    %1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
+    %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_e32 65, implicit $exec
+    %4:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %5:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
+    %6:agpr_32 = V_ACCVGPR_WRITE_B32_e64 6, implicit $exec
+
+  bb.1:
+    %7:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+
+...
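An aside on the checks above, for reviewers skimming the MIR: the pass only rewrites moves of *inline* immediates, which is why `1`, `2`, and `6` become AV_MOV_B32_IMM_PSEUDO under HAS-AGPR while the register copy of `$vgpr0`, the non-inline immediate `65`, and the frame index `%stack.0` keep their original opcodes. The toy model below is a sketch of that predicate only, not the in-tree pass: the struct and function names are hypothetical, and the real inline-immediate set is larger (it also admits certain float bit patterns).

```cpp
#include <cstdint>

// Hypothetical stand-in for one 32-bit move; the real pass inspects MachineInstr.
struct ImmMove {
  bool SrcIsReg;        // e.g. V_MOV_B32_e32 $vgpr0
  bool SrcIsFrameIndex; // e.g. V_MOV_B32_e32 %stack.0
  int64_t Imm;          // source value when neither of the above
};

// Simplified inline-immediate test: integers -16..64 are inline constants
// on AMDGPU (float inline constants omitted in this sketch).
static bool isInlineImm(int64_t V) { return V >= -16 && V <= 64; }

// True when the move may become AV_MOV_B32_IMM_PSEUDO, matching the
// HAS-AGPR checks: 1, 2, and 6 qualify; $vgpr0, 65, and %stack.0 do not.
static bool shouldUseAVMovImmPseudo(const ImmMove &M, bool HasAGPRs) {
  if (!HasAGPRs || M.SrcIsReg || M.SrcIsFrameIndex)
    return false;
  return isInlineImm(M.Imm);
}
```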
+ +--- +name: no_agprs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX90A-LABEL: name: no_agprs + ; GFX90A: liveins: $vgpr0 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GFX90A-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + ; + ; GFX908-LABEL: name: no_agprs + ; GFX908: liveins: $vgpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; + ; NO-AGPR-LABEL: name: no_agprs + ; NO-AGPR: liveins: $vgpr0 + ; NO-AGPR-NEXT: {{ $}} + ; NO-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + +... diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll new file mode 100644 index 0000000000000..029604c2933a9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s + +; TODO: Add global-isel when it can support bf16 + +define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) { +; GCN-LABEL: llvm_log2_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_log_bf16_e32 v2, v2 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %log = call bfloat @llvm.log2.bf16(bfloat %src) + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { +; GCN-LABEL: llvm_log2_bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_log_bf16_e32 v2, s0 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %log = call bfloat @llvm.log2.bf16(bfloat %src) + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) { +; GCN-LABEL: llvm_exp2_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_exp_bf16_e32 v2, v2 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %exp = call bfloat @llvm.exp2.bf16(bfloat %src) + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { +; GCN-LABEL: llvm_exp2_bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_exp_bf16_e32 v2, s0 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %exp = call bfloat @llvm.exp2.bf16(bfloat %src) + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +declare bfloat @llvm.log2.bf16(bfloat) +declare bfloat @llvm.exp2.bf16(bfloat) diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 92c63fead15ac..7eb7d72e6cb97 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: 
renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc @@ -56,8 +56,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr15 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: @@ -112,14 +112,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.7(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr19 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr21 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr23 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr25 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 = 
AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) @@ -671,7 +671,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec - ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} @@ -759,17 +759,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY 
renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec @@ -801,12 +801,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr53 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} @@ -814,7 +814,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed 
renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) @@ -913,7 +913,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec @@ -955,7 +955,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec @@ -989,7 +989,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec - ; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr55 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.69 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll new file mode 100644 index 0000000000000..d103423ae1675 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -0,0 +1,630 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + + +; FIXME: We should use llvm-mc for this, but we can't even parse our own output. +; See PR33579. +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=0 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s +; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s + +; OBJ: Relocations [ +; OBJ-NEXT: ] + +; Restrict maximum branch to between +7 and -8 dwords + +; Used to emit an always 4 byte instruction. Inline asm always assumes +; each instruction is the maximum size, 12 bytes in case of gfx1250. 
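To make the dword arithmetic in the comments below easier to follow: `-amdgpu-s-branch-bits=4` models a signed 4-bit, dword-scaled branch immediate, so a short branch reaches -8..+7 dwords (-32..+28 bytes). With inline asm conservatively counted at 12 bytes per instruction on gfx1250, two v_nop_e64 plus a 4-byte s_sleep come to 28 bytes = +7 dwords, which still fits (the "max short forward" test); one more s_sleep makes 32 bytes = 8 dwords and forces expansion (the "min long forward" test). The sketch below shows only this range check, under those assumptions; it is not the in-tree SIInstrInfo logic and ignores the exact offset convention the backend uses.

```cpp
#include <cstdint>
#include <cstdio>

// Does a dword-scaled, N-bit signed branch immediate cover OffsetBytes?
static bool branchOffsetInRange(int64_t OffsetBytes, unsigned BranchBits) {
  const int64_t Dwords = OffsetBytes / 4;            // immediate counts dwords
  const int64_t Max = (1LL << (BranchBits - 1)) - 1; //  +7 for 4 bits
  const int64_t Min = -(1LL << (BranchBits - 1));    //  -8 for 4 bits
  return Dwords >= Min && Dwords <= Max;
}

int main() {
  // Two v_nop_e64 at the assumed worst-case 12 bytes each, plus s_sleep:
  std::printf("%d\n", branchOffsetInRange(28, 4)); // 1: fits in +7 dwords
  std::printf("%d\n", branchOffsetInRange(32, 4)); // 0: needs long-branch expansion
}
```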
+declare void @llvm.amdgcn.s.sleep(i32) #0 + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + + +define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { +; GCN-LABEL: uniform_conditional_max_short_forward_branch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: .LBB0_2: ; %bb3 +; GCN-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS +; GCN-NEXT: s_wait_storecnt 0x0 +; GCN-NEXT: s_endpgm +bb: + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: +; 24 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + br label %bb3 + +bb3: + store volatile i32 %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 { +; GCN-LABEL: uniform_conditional_min_long_forward_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_cbranch_scc0 .LBB1_1 +; GCN-NEXT: ; %bb.3: ; %bb0 +; GCN-NEXT: s_add_pc_i64 .LBB1_2-.Lpost_addpc0 +; GCN-NEXT: .Lpost_addpc0: +; GCN-NEXT: .LBB1_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: .LBB1_2: ; %bb3 +; GCN-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS +; GCN-NEXT: s_wait_storecnt 0x0 +; GCN-NEXT: s_endpgm +bb0: + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch + +bb2: +; 32 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + call void @llvm.amdgcn.s.sleep(i32 0) + br label %bb3 + +bb3: + store volatile i32 %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 { +; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_cmp_eq_f32 s0, 0 +; GCN-NEXT: s_cbranch_scc0 .LBB2_1 +; GCN-NEXT: ; %bb.3: ; %bb0 +; GCN-NEXT: s_add_pc_i64 .LBB2_2-.Lpost_addpc1 +; GCN-NEXT: .Lpost_addpc1: +; GCN-NEXT: .LBB2_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; 32 bytes +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: .LBB2_2: ; %bb3 +; GCN-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS +; GCN-NEXT: s_wait_storecnt 0x0 +; GCN-NEXT: s_endpgm +bb0: + %cmp = fcmp oeq float %cnd, 0.0 + br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch + +bb2: + call void asm sideeffect " ; 32 bytes + v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + call void @llvm.amdgcn.s.sleep(i32 
0) + br label %bb3 + +bb3: + store volatile float %cnd, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { +; GCN-LABEL: min_long_forward_vbranch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: s_wait_xcnt 0x0 +; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GCN-NEXT: s_mov_b32 s0, exec_lo +; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GCN-NEXT: s_cbranch_execnz .LBB3_1 +; GCN-NEXT: ; %bb.3: ; %bb +; GCN-NEXT: s_add_pc_i64 .LBB3_2-.Lpost_addpc2 +; GCN-NEXT: .Lpost_addpc2: +; GCN-NEXT: .LBB3_1: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; 32 bytes +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: .LBB3_2: ; %bb3 +; GCN-NEXT: s_wait_alu 0xfffe +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS +; GCN-NEXT: s_wait_storecnt 0x0 +; GCN-NEXT: s_endpgm +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = zext i32 %tid to i64 + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext + %load = load volatile i32, ptr addrspace(1) %gep + %cmp = icmp eq i32 %load, 0 + br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch + +bb2: + call void asm sideeffect " ; 32 bytes + v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + call void @llvm.amdgcn.s.sleep(i32 0) + br label %bb3 + +bb3: + store volatile i32 %load, ptr addrspace(1) %gep + ret void +} + +define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { +; GCN-LABEL: long_backward_sbranch: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: .LBB4_1: ; %bb2 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_add_co_i32 s0, s0, 1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_cmp_lt_i32 s0, 10 +; GCN-NEXT: s_cbranch_scc0 .LBB4_2 +; GCN-NEXT: ; %bb.3: ; %bb2 +; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; GCN-NEXT: s_add_pc_i64 .LBB4_1-.Lpost_addpc3 +; GCN-NEXT: .Lpost_addpc3: +; GCN-NEXT: .LBB4_2: ; %bb3 +; GCN-NEXT: s_endpgm +bb: + br label %bb2 + +bb2: + %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ] + ; 24 bytes + call void asm sideeffect + "v_nop_e64 + v_nop_e64", ""() #0 + %inc = add nsw i32 %loop.idx, 1 ; add cost 4 + %cmp = icmp slt i32 %inc, 10 ; condition cost = 8 + br i1 %cmp, label %bb2, label %bb3 ; - + +bb3: + ret void +} + +; Requires expansion of unconditional branch from %bb2 to %bb4 (and +; expansion of conditional branch from %bb to %bb3. 
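The comment above (and the `expand_requires_expand` test that follows) pins down the interesting property here: relaxing one branch inserts an s_add_pc_i64 sequence, which grows the function, which can in turn push a previously in-range branch out of range. Conceptually the relaxation therefore iterates to a fixpoint. The sketch below is a toy illustration of that cascade with hypothetical names and a flat offset model; the in-tree BranchRelaxation pass is structured differently.

```cpp
#include <cstdint>
#include <vector>

struct Branch {
  int64_t SrcOffset; // byte offset of the branch instruction
  int64_t DstOffset; // byte offset of its target
  bool Expanded;     // already rewritten to a long-branch sequence?
};

// Same 4-bit, dword-scaled field as in these tests.
static bool inRange(int64_t DeltaBytes) {
  const int64_t D = DeltaBytes / 4;
  return D >= -8 && D <= 7;
}

// Expanding one branch shifts every later offset, so rescan until stable:
// this is the "expansion introduces need to expand" case the tests cover.
static void relaxAll(std::vector<Branch> &Branches, int64_t ExpansionBytes) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (Branch &B : Branches) {
      if (B.Expanded || inRange(B.DstOffset - B.SrcOffset))
        continue;
      B.Expanded = true;
      Changed = true;
      for (Branch &Other : Branches) { // everything past B moves down
        if (Other.SrcOffset > B.SrcOffset)
          Other.SrcOffset += ExpansionBytes;
        if (Other.DstOffset > B.SrcOffset)
          Other.DstOffset += ExpansionBytes;
      }
    }
  }
}
```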
+ +define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { +; GCN-LABEL: uniform_unconditional_min_long_forward_branch: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_cmp_eq_u32 s0, 0 +; GCN-NEXT: s_mov_b32 s0, -1 +; GCN-NEXT: s_cbranch_scc0 .LBB5_1 +; GCN-NEXT: ; %bb.7: ; %bb0 +; GCN-NEXT: s_add_pc_i64 .LBB5_4-.Lpost_addpc5 +; GCN-NEXT: .Lpost_addpc5: +; GCN-NEXT: .LBB5_1: ; %Flow +; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GCN-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN-NEXT: .LBB5_2: ; %bb2 +; GCN-NEXT: v_mov_b32_e32 v0, 17 +; GCN-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; GCN-NEXT: s_wait_storecnt 0x0 +; GCN-NEXT: .LBB5_3: ; %bb4 +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GCN-NEXT: s_wait_xcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GCN-NEXT: s_wait_storecnt 0x0 +; GCN-NEXT: s_endpgm +; GCN-NEXT: .LBB5_4: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: s_cbranch_execnz .LBB5_5 +; GCN-NEXT: ; %bb.9: ; %bb3 +; GCN-NEXT: s_add_pc_i64 .LBB5_2-.Lpost_addpc6 +; GCN-NEXT: .Lpost_addpc6: +; GCN-NEXT: .LBB5_5: ; %bb3 +; GCN-NEXT: s_add_pc_i64 .LBB5_3-.Lpost_addpc4 +; GCN-NEXT: .Lpost_addpc4: +bb0: + %tmp = icmp ne i32 %arg1, 0 + br i1 %tmp, label %bb2, label %bb3 + +bb2: + store volatile i32 17, ptr addrspace(1) undef + br label %bb4 + +bb3: + ; 32 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + call void @llvm.amdgcn.s.sleep(i32 0) + br label %bb4 + +bb4: + store volatile i32 63, ptr addrspace(1) %arg + ret void +} + +define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr addrspace(1) %arg, i32 %arg1) { +; GCN-LABEL: uniform_unconditional_min_long_backward_branch: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 vcc_lo, exec_lo +; GCN-NEXT: .LBB6_1: ; %loop +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: s_cbranch_vccz .LBB6_2 +; GCN-NEXT: ; %bb.3: ; %loop +; GCN-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; GCN-NEXT: s_add_pc_i64 .LBB6_1-.Lpost_addpc7 +; GCN-NEXT: .Lpost_addpc7: +; GCN-NEXT: .LBB6_2: ; %DummyReturnBlock +; GCN-NEXT: s_endpgm +entry: + br label %loop + +loop: + ; 32 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + call void @llvm.amdgcn.s.sleep(i32 0) + br label %loop +} + +; Expansion of branch from %bb1 to %bb3 introduces need to expand +; branch from %bb0 to %bb2 + +define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { +; GCN-LABEL: expand_requires_expand: +; GCN: ; %bb.0: ; %bb0 +; GCN-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_cmp_lt_i32 s0, 0 +; GCN-NEXT: s_cselect_b32 s0, -1, 0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GCN-NEXT: s_cbranch_vccnz .LBB7_2 +; GCN-NEXT: ; %bb.1: ; %bb1 +; GCN-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_cmp_lg_u32 s0, 3 +; GCN-NEXT: s_cselect_b32 s0, -1, 0 +; GCN-NEXT: .LBB7_2: ; %Flow +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: 
s_and_not1_b32 vcc_lo, exec_lo, s0 +; GCN-NEXT: s_cbranch_vccz .LBB7_3 +; GCN-NEXT: ; %bb.5: ; %Flow +; GCN-NEXT: s_add_pc_i64 .LBB7_4-.Lpost_addpc8 +; GCN-NEXT: .Lpost_addpc8: +; GCN-NEXT: .LBB7_3: ; %bb2 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: s_sleep 0 +; GCN-NEXT: .LBB7_4: ; %bb3 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: v_nop_e64 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm +bb0: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %cmp0 = icmp slt i32 %cond0, 0 + br i1 %cmp0, label %bb2, label %bb1 + +bb1: + %val = load volatile i32, ptr addrspace(4) undef + %cmp1 = icmp eq i32 %val, 3 + br i1 %cmp1, label %bb3, label %bb2 + +bb2: + call void asm sideeffect + "v_nop_e64 + v_nop_e64", ""() #0 + call void @llvm.amdgcn.s.sleep(i32 0) + call void @llvm.amdgcn.s.sleep(i32 0) + br label %bb3 + +bb3: +; These NOPs prevent tail-duplication-based outlining +; from firing, which defeats the need to expand the branches and this test. + call void asm sideeffect + "v_nop_e64", ""() #0 + call void asm sideeffect + "v_nop_e64", ""() #0 + ret void +} + +; Requires expanding of required skip branch. + +define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond) #0 { +; GCN-LABEL: uniform_inside_divergent: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_cmpx_gt_u32_e32 16, v0 +; GCN-NEXT: s_cbranch_execnz .LBB8_1 +; GCN-NEXT: ; %bb.4: ; %entry +; GCN-NEXT: s_add_pc_i64 .LBB8_3-.Lpost_addpc9 +; GCN-NEXT: .Lpost_addpc9: +; GCN-NEXT: .LBB8_1: ; %if +; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: global_store_b32 v0, v0, s[0:1] +; GCN-NEXT: s_cbranch_scc1 .LBB8_3 +; GCN-NEXT: ; %bb.2: ; %if_uniform +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] +; GCN-NEXT: .LBB8_3: ; %endif +; GCN-NEXT: s_wait_xcnt 0x0 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GCN-NEXT: s_sleep 5 +; GCN-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %endif + +if: + store i32 0, ptr addrspace(1) %out + %u_cmp = icmp eq i32 %cond, 0 + br i1 %u_cmp, label %if_uniform, label %endif + +if_uniform: + store i32 1, ptr addrspace(1) %out + br label %endif + +endif: + ; layout can remove the split branch if it can copy the return block. + ; This call makes the return block long enough that it doesn't get copied. 
+ call void @llvm.amdgcn.s.sleep(i32 5)
+ ret void
+}
+
+; si_mask_branch
+
+define amdgpu_kernel void @analyze_mask_branch() #0 {
+; GCN-LABEL: analyze_mask_branch:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, exec_lo
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_mov_b32_e64 v0, 0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: v_cmpx_nlt_f32_e32 0, v0
+; GCN-NEXT: s_xor_b32 s0, exec_lo, s0
+; GCN-NEXT: s_cbranch_execz .LBB9_2
+; GCN-NEXT: ; %bb.1: ; %ret
+; GCN-NEXT: v_mov_b32_e32 v0, 7
+; GCN-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
+; GCN-NEXT: s_wait_storecnt 0x0
+; GCN-NEXT: .LBB9_2: ; %Flow1
+; GCN-NEXT: s_wait_xcnt 0x0
+; GCN-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GCN-NEXT: s_cbranch_execnz .LBB9_3
+; GCN-NEXT: ; %bb.6: ; %Flow1
+; GCN-NEXT: s_add_pc_i64 .LBB9_5-.Lpost_addpc10
+; GCN-NEXT: .Lpost_addpc10:
+; GCN-NEXT: .LBB9_3: ; %loop.preheader
+; GCN-NEXT: s_mov_b32 vcc_lo, 0
+; GCN-NEXT: .LBB9_4: ; %loop
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_sleep 0
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_sleep 0
+; GCN-NEXT: s_sleep 0
+; GCN-NEXT: s_cbranch_vccnz .LBB9_5
+; GCN-NEXT: ; %bb.8: ; %loop
+; GCN-NEXT: ; in Loop: Header=BB9_4 Depth=1
+; GCN-NEXT: s_add_pc_i64 .LBB9_4-.Lpost_addpc11
+; GCN-NEXT: .Lpost_addpc11:
+; GCN-NEXT: .LBB9_5: ; %UnifiedReturnBlock
+; GCN-NEXT: s_endpgm
+entry:
+ %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
+ %cmp0 = fcmp ogt float %reg, 0.000000e+00
+ br i1 %cmp0, label %loop, label %ret
+
+loop:
+ %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
+ call void asm sideeffect
+ "v_nop_e64", ""() #0
+ call void @llvm.amdgcn.s.sleep(i32 0)
+ %cmp1 = fcmp olt float %phi, 8.0
+ br i1 %cmp1, label %loop_body, label %ret
+
+loop_body:
+ call void asm sideeffect
+ "v_nop_e64
+ v_nop_e64", ""() #0
+ call void @llvm.amdgcn.s.sleep(i32 0)
+ call void @llvm.amdgcn.s.sleep(i32 0)
+ br label %loop
+
+ret:
+ store volatile i32 7, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
+; GCN-LABEL: long_branch_hang:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GCN-NEXT: s_mov_b32 s7, -1
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_cmp_eq_u32 s0, 0
+; GCN-NEXT: s_cselect_b32 s6, -1, 0
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_cselect_b32 s8, -1, 0
+; GCN-NEXT: s_cmp_lt_i32 s3, 6
+; GCN-NEXT: s_cbranch_scc0 .LBB10_1
+; GCN-NEXT: ; %bb.10: ; %bb
+; GCN-NEXT: s_add_pc_i64 .LBB10_4-.Lpost_addpc13
+; GCN-NEXT: .Lpost_addpc13:
+; GCN-NEXT: .LBB10_1: ; %Flow
+; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7
+; GCN-NEXT: s_cbranch_vccnz .LBB10_2
+; GCN-NEXT: ; %bb.12: ; %Flow
+; GCN-NEXT: s_add_pc_i64 .LBB10_5-.Lpost_addpc14
+; GCN-NEXT: .Lpost_addpc14:
+; GCN-NEXT: .LBB10_2: ; %Flow5
+; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GCN-NEXT: s_cbranch_vccz .LBB10_3
+; GCN-NEXT: ; %bb.14: ; %Flow5
+; GCN-NEXT: s_add_pc_i64 .LBB10_6-.Lpost_addpc15
+; GCN-NEXT: .Lpost_addpc15:
+; GCN-NEXT: .LBB10_3: ; %bb14
+; GCN-NEXT: s_cmp_lt_i32 s1, 9
+; GCN-NEXT: s_cselect_b32 s0, -1, 0
+; GCN-NEXT: s_cmp_lt_i32 s2, s3
+; GCN-NEXT: s_cselect_b32 s1, -1, 0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_or_b32 s0, s1, s0
+; GCN-NEXT: s_and_b32 s0, s6, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT: ; %bb.8: ; %bb14
+; GCN-NEXT: s_add_pc_i64 .LBB10_7-.Lpost_addpc12
+; GCN-NEXT: .Lpost_addpc12:
+; GCN-NEXT: .LBB10_4: ; %bb13
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_sleep 0
+; GCN-NEXT: s_mov_b32 s0, s8
+; GCN-NEXT: s_sleep 0
+; GCN-NEXT: s_cbranch_execz .LBB10_5
+; GCN-NEXT: ; %bb.16: ; %bb13
+; GCN-NEXT: s_add_pc_i64 .LBB10_2-.Lpost_addpc16
+; GCN-NEXT: .Lpost_addpc16:
+; GCN-NEXT: .LBB10_5: ; %bb9
+; GCN-NEXT: s_cmp_lt_i32 s3, 11
+; GCN-NEXT: s_cselect_b32 s0, -1, 0
+; GCN-NEXT: s_cmp_ge_i32 s2, s3
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_and_b32 s0, s7, s0
+; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GCN-NEXT: s_cbranch_vccnz .LBB10_6
+; GCN-NEXT: ; %bb.18: ; %bb9
+; GCN-NEXT: s_add_pc_i64 .LBB10_3-.Lpost_addpc17
+; GCN-NEXT: .Lpost_addpc17:
+; GCN-NEXT: .LBB10_6:
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB10_7: ; %bb19
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x3c
+; GCN-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN-NEXT: s_wait_alu 0xfffe
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: global_store_b32 v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
+bb:
+ %tmp = icmp slt i32 %arg2, 9
+ %tmp6 = icmp eq i32 %arg1, 0
+ %tmp8 = icmp sgt i32 %arg4, 5
+ br i1 %tmp8, label %bb9, label %bb13
+
+bb9: ; preds = %bb
+ %tmp7 = icmp sgt i32 %arg4, 10 ; avoid being optimized away through dominance
+ %tmp11 = icmp slt i32 %arg3, %arg4
+ %tmp12 = or i1 %tmp11, %tmp7
+ br i1 %tmp12, label %bb19, label %bb14
+
+bb13: ; preds = %bb
+ call void asm sideeffect
+ "v_nop_e64
+ v_nop_e64", ""() #0
+ call void @llvm.amdgcn.s.sleep(i32 0)
+ call void @llvm.amdgcn.s.sleep(i32 0)
+ br i1 %tmp6, label %bb19, label %bb14
+
+bb14: ; preds = %bb13, %bb9
+ %tmp15 = icmp slt i32 %arg3, %arg4
+ %tmp16 = or i1 %tmp15, %tmp
+ %tmp17 = and i1 %tmp6, %tmp16
+ %tmp18 = zext i1 %tmp17 to i32
+ br label %bb19
+
+bb19: ; preds = %bb14, %bb13, %bb9
+ %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ]
+ %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %arg5
+ store i32 %tmp20, ptr addrspace(1) %tmp21, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
index c69e12731e10d..3c991cfb7a1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -444,14 +444,6 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
-; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
-; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
-; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
-; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
-; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
-; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
-; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload
Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80 @@ -464,20 +456,15 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: s_nop 0 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2) -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 @@ -490,10 +477,8 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split @@ -822,14 +807,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32 -; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0 -; 
SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 -; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80 @@ -842,20 +819,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0 +; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 +; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse +; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2) -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80 @@ -868,10 +841,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224 -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split @@ -993,16 +964,6 @@ define 
amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32 -; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0 -; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 -; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1 -; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80 @@ -1015,20 +976,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: s_nop 0 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0 +; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 +; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1 +; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse +; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2) -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80 @@ -1041,10 +1000,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, 
s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir new file mode 100644 index 0000000000000..d0c9740c6954e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -0,0 +1,15 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s +--- +name: snork +body: | + bb.0: + ; CHECK-LABEL: name: snork + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: SI_RETURN + %0:sreg_32 = S_MOV_B32 0 + %1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 + %2:sreg_32 = S_OR_B32 %1.sub0, %1.sub3, implicit-def dead $scc + SI_RETURN +... diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll index ac03d2dae8fa8..dea9142cf2bee 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll @@ -1,8 +1,10 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,NOT-GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10,NOT-GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX1100,NOT-GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX1150,NOT-GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX1200 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX1250 %s declare float @llvm.fabs.f32(float) declare float @llvm.fma.f32(float, float, float) @@ -35,11 +37,19 @@ define float @v_mul_f32_vop2(float %x, float %y) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_mul_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x10] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f32_vop2: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x10] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %mul = fmul float %x, %y ret float %mul } ; NOT-GFX12: codeLenInByte = 12 ; GFX1200: codeLenInByte = 28 +; GFX1250: codeLenInByte = 16 define float 
@v_mul_f32_vop2_inline_imm(float %x) { ; GFX9-LABEL: v_mul_f32_vop2_inline_imm: @@ -69,11 +79,19 @@ define float @v_mul_f32_vop2_inline_imm(float %x) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; encoding: [0xf6,0x00,0x00,0x10] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f32_vop2_inline_imm: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; encoding: [0xf6,0x00,0x00,0x10] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %mul = fmul float %x, 4.0 ret float %mul } ; NOT-GFX12: codeLenInByte = 12 ; GFX1200: codeLenInByte = 28 +; GFX1250: codeLenInByte = 16 define float @v_mul_f32_vop2_literal(float %x) { ; GFX9-LABEL: v_mul_f32_vop2_literal: @@ -103,11 +121,19 @@ define float @v_mul_f32_vop2_literal(float %x) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_mul_f32_e32 v0, 0x42f60000, v0 ; encoding: [0xff,0x00,0x00,0x10,0x00,0x00,0xf6,0x42] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f32_vop2_literal: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f32_e32 v0, 0x42f60000, v0 ; encoding: [0xff,0x00,0x00,0x10,0x00,0x00,0xf6,0x42] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %mul = fmul float %x, 123.0 ret float %mul } ; NOT-GFX12: codeLenInByte = 16 ; GFX1200: codeLenInByte = 32 +; GFX1250: codeLenInByte = 20 define float @v_mul_f32_vop3_src_mods(float %x, float %y) { ; GFX9-LABEL: v_mul_f32_vop3_src_mods: @@ -137,12 +163,20 @@ define float @v_mul_f32_vop3_src_mods(float %x, float %y) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f32_vop3_src_mods: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %fabs.x = call float @llvm.fabs.f32(float %x) %mul = fmul float %fabs.x, %y ret float %mul } ; NOT-GFX12: codeLenInByte = 16 ; GFX1200: codeLenInByte = 32 +; GFX1250: codeLenInByte = 20 define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) { ; GFX9-LABEL: v_mul_f32_vop3_src_mods_inline_imm: @@ -172,6 +206,13 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f32_vop3_src_mods_inline_imm: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00] +; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %fabs.x = call float @llvm.fabs.f32(float %x) %mul = fmul float %fabs.x, 4.0 ret float %mul @@ -179,6 +220,7 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) { ; NOT-GFX12: codeLenInByte = 16 ; GFX1200: codeLenInByte = 32 +; GFX1250: codeLenInByte = 20 define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) { ; GFX9-LABEL: v_mul_f32_vop3_src_mods_literal: @@ -209,6 +251,13 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f32_vop3_src_mods_literal: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %fabs.x = call float @llvm.fabs.f32(float %x) %mul = fmul float %fabs.x, 123.0 ret float %mul @@ -218,6 +267,7 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) { ; GFX10: codeLenInByte = 20 ; GFX11: codeLenInByte = 20 ; GFX1200: codeLenInByte = 36 +; GFX1250: codeLenInByte = 24 define float @v_mul_f32_vop2_frame_index(float %x) { ; GFX9-LABEL: v_mul_f32_vop2_frame_index: @@ -249,6 +299,13 @@ define float @v_mul_f32_vop2_frame_index(float %x) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_mul_f32_e32 v0, s32, v0 ; encoding: [0x20,0x00,0x00,0x10] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f32_vop2_frame_index: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f32_e32 v0, s32, v0 ; encoding: [0x20,0x00,0x00,0x10] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %alloca = alloca i32, addrspace(5) %ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32 %cast = bitcast i32 %ptrtoint to float @@ -260,6 +317,7 @@ define float @v_mul_f32_vop2_frame_index(float %x) { ; GFX10: codeLenInByte = 20 ; GFX11: codeLenInByte = 12 ; GFX1200: codeLenInByte = 28 +; GFX1250: codeLenInByte = 16 define float @v_fma_f32(float %x, float %y, float %z) { ; GFX9-LABEL: v_fma_f32: @@ -289,12 +347,20 @@ define float @v_fma_f32(float %x, float %y, float %z) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_fma_f32 v0, v0, v1, v2 ; encoding: [0x00,0x00,0x13,0xd6,0x00,0x03,0x0a,0x04] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_fma_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_fma_f32 v0, v0, v1, v2 ; encoding: [0x00,0x00,0x13,0xd6,0x00,0x03,0x0a,0x04] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } ; NOT-GFX12: codeLenInByte = 16 ; GFX1200: codeLenInByte = 32 +; GFX1250: codeLenInByte = 20 define float 
@v_fma_f32_src_mods(float %x, float %y, float %z) { ; GFX9-LABEL: v_fma_f32_src_mods: @@ -324,6 +390,13 @@ define float @v_fma_f32_src_mods(float %x, float %y, float %z) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0x0a,0x04] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_fma_f32_src_mods: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0x0a,0x04] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %fabs.x = call float @llvm.fabs.f32(float %x) %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z) ret float %fma @@ -331,6 +404,7 @@ define float @v_fma_f32_src_mods(float %x, float %y, float %z) { ; NOT-GFX12: codeLenInByte = 16 ; GFX1200: codeLenInByte = 32 +; GFX1250: codeLenInByte = 20 define float @v_fmac_f32(float %x, float %y) { ; GFX9-LABEL: v_fmac_f32: @@ -360,6 +434,13 @@ define float @v_fmac_f32(float %x, float %y) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_fmac_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x56] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_fmac_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_fmac_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x56] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %fma = call float @llvm.fma.f32(float %x, float %y, float %x) ret float %fma } @@ -368,6 +449,7 @@ define float @v_fmac_f32(float %x, float %y) { ; GFX10: codeLenInByte = 12 ; GFX11: codeLenInByte = 12 ; GFX1200: codeLenInByte = 28 +; GFX1250: codeLenInByte = 16 define float @v_fmaak_f32(float %x, float %y) { ; GFX9-LABEL: v_fmaak_f32: @@ -398,6 +480,13 @@ define float @v_fmaak_f32(float %x, float %y) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_fmaak_f32 v0, v0, v1, 0x43800000 ; encoding: [0x00,0x03,0x00,0x5a,0x00,0x00,0x80,0x43] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_fmaak_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_fmaak_f32 v0, v0, v1, 0x43800000 ; encoding: [0x00,0x03,0x00,0x5a,0x00,0x00,0x80,0x43] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %fma = call float @llvm.fma.f32(float %x, float %y, float 256.0) ret float %fma } @@ -406,6 +495,7 @@ define float @v_fmaak_f32(float %x, float %y) { ; GFX10: codeLenInByte = 16 ; GFX11: codeLenInByte = 16 ; GFX1200: codeLenInByte = 32 +; GFX1250: codeLenInByte = 20 define float @v_fma_k_f32_src_mods(float %x, float %y) { ; GFX9-LABEL: v_fma_k_f32_src_mods: @@ -436,6 +526,13 @@ define float @v_fma_k_f32_src_mods(float %x, float %y) { ; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] ; GFX1200-NEXT: v_fma_f32 v0, |v0|, v1, 0x43800000 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0xfe,0x03,0x00,0x00,0x80,0x43] ; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: 
v_fma_k_f32_src_mods: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_fma_f32 v0, |v0|, v1, 0x43800000 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0xfe,0x03,0x00,0x00,0x80,0x43] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %fabs.x = call float @llvm.fabs.f32(float %x) %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float 256.0) ret float %fma @@ -445,6 +542,7 @@ define float @v_fma_k_f32_src_mods(float %x, float %y) { ; GFX10: codeLenInByte = 20 ; GFX11: codeLenInByte = 20 ; GFX1200: codeLenInByte = 36 +; GFX1250: codeLenInByte = 24 define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) { ; GFX9-LABEL: s_fmaak_f32: @@ -480,6 +578,13 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) { ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; encoding: [0x0b,0x00,0x87,0xbf] ; GFX1200-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e] ; GFX1200-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_fmaak_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_fmaak_f32 s0, s0, s1, 0x43800000 ; encoding: [0x00,0x01,0x80,0xa2,0x00,0x00,0x80,0x43] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; encoding: [0x0b,0x00,0x87,0xbf] +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e] +; GFX1250-NEXT: ; return to shader part epilog %fma = call float @llvm.fma.f32(float %x, float %y, float 256.0) ret float %fma } @@ -489,3 +594,212 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) { ; GFX1100: codeLenInByte = 16 ; GFX1150: codeLenInByte = 16 ; GFX1200: codeLenInByte = 16 +; GFX1250: codeLenInByte = 16 + +define double @v_mul_f64_vop2_literal_32(double %x) { +; GFX9-LABEL: v_mul_f64_vop2_literal_32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX9-NEXT: s_mov_b32 s4, 0 ; encoding: [0x80,0x00,0x84,0xbe] +; GFX9-NEXT: s_mov_b32 s5, 0x405ec000 ; encoding: [0xff,0x00,0x85,0xbe,0x00,0xc0,0x5e,0x40] +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; encoding: [0x00,0x00,0x81,0xd2,0x00,0x09,0x00,0x00] +; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] +; +; GFX10-LABEL: v_mul_f64_vop2_literal_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX10-NEXT: v_mul_f64 v[0:1], 0x405ec000, v[0:1] ; encoding: [0x00,0x00,0x65,0xd5,0xff,0x00,0x02,0x00,0x00,0xc0,0x5e,0x40] +; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] +; +; GFX11-LABEL: v_mul_f64_vop2_literal_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: v_mul_f64 v[0:1], 0x405ec000, v[0:1] ; encoding: [0x00,0x00,0x28,0xd7,0xff,0x00,0x02,0x00,0x00,0xc0,0x5e,0x40] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1200-LABEL: v_mul_f64_vop2_literal_32: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1200-NEXT: s_wait_expcnt 0x0 ; encoding: [0x00,0x00,0xc4,0xbf] +; GFX1200-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf] +; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1200-NEXT: v_mul_f64_e32 v[0:1], 0x405ec000, v[0:1] ; encoding: [0xff,0x00,0x00,0x0c,0x00,0xc0,0x5e,0x40] +; GFX1200-NEXT: 
s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f64_vop2_literal_32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 0x405ec000, v[0:1] ; encoding: [0xff,0x00,0x00,0x0c,0x00,0xc0,0x5e,0x40] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] + %mul = fmul double %x, 123.0 + ret double %mul +} + +; GFX9: codeLenInByte = 28 +; GFX10: codeLenInByte = 20 +; GFX1100: codeLenInByte = 20 +; GFX1150: codeLenInByte = 20 +; GFX1250: codeLenInByte = 20 + +define double @v_mul_f64_vop2_literal_64(double %x) { +; GFX9-LABEL: v_mul_f64_vop2_literal_64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX9-NEXT: s_mov_b32 s4, 0x66666666 ; encoding: [0xff,0x00,0x84,0xbe,0x66,0x66,0x66,0x66] +; GFX9-NEXT: s_mov_b32 s5, 0x405ec666 ; encoding: [0xff,0x00,0x85,0xbe,0x66,0xc6,0x5e,0x40] +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; encoding: [0x00,0x00,0x81,0xd2,0x00,0x09,0x00,0x00] +; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] +; +; GFX10-LABEL: v_mul_f64_vop2_literal_64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX10-NEXT: s_mov_b32 s4, 0x66666666 ; encoding: [0xff,0x03,0x84,0xbe,0x66,0x66,0x66,0x66] +; GFX10-NEXT: s_mov_b32 s5, 0x405ec666 ; encoding: [0xff,0x03,0x85,0xbe,0x66,0xc6,0x5e,0x40] +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] ; encoding: [0x00,0x00,0x65,0xd5,0x00,0x09,0x00,0x00] +; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] +; +; GFX11-LABEL: v_mul_f64_vop2_literal_64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_mov_b32 s0, 0x66666666 ; encoding: [0xff,0x00,0x80,0xbe,0x66,0x66,0x66,0x66] +; GFX11-NEXT: s_mov_b32 s1, 0x405ec666 ; encoding: [0xff,0x00,0x81,0xbe,0x66,0xc6,0x5e,0x40] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; encoding: [0x09,0x00,0x87,0xbf] +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] ; encoding: [0x00,0x00,0x28,0xd7,0x00,0x01,0x00,0x00] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1200-LABEL: v_mul_f64_vop2_literal_64: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1200-NEXT: s_wait_expcnt 0x0 ; encoding: [0x00,0x00,0xc4,0xbf] +; GFX1200-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf] +; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1200-NEXT: s_mov_b32 s0, 0x66666666 ; encoding: [0xff,0x00,0x80,0xbe,0x66,0x66,0x66,0x66] +; GFX1200-NEXT: s_mov_b32 s1, 0x405ec666 ; encoding: [0xff,0x00,0x81,0xbe,0x66,0xc6,0x5e,0x40] +; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf] +; GFX1200-NEXT: v_mul_f64_e32 v[0:1], s[0:1], v[0:1] ; encoding: [0x00,0x00,0x00,0x0c] +; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_mul_f64_vop2_literal_64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_mul_f64_e32 v[0:1], lit64(0x405ec66666666666), v[0:1] ; encoding: [0xfe,0x00,0x00,0x0c,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] +; GFX1250-NEXT: 
s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] + %mul = fmul double %x, 123.1 + ret double %mul +} + +; GFX9: codeLenInByte = 32 +; GFX10: codeLenInByte = 32 +; GFX1100: codeLenInByte = 36 +; GFX1150: codeLenInByte = 36 +; GFX1250: codeLenInByte = 24 + +define i64 @v_add_u64_vop2_literal_32(i64 %x) { +; GFX9-LABEL: v_add_u64_vop2_literal_32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7b, v0 ; encoding: [0xff,0x00,0x00,0x32,0x7b,0x00,0x00,0x00] +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; encoding: [0x80,0x02,0x02,0x38] +; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] +; +; GFX10-LABEL: v_add_u64_vop2_literal_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x0f,0xd7,0xff,0x00,0x02,0x00,0x7b,0x00,0x00,0x00] +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; encoding: [0x01,0x7d,0x28,0xd5,0x80,0x02,0xaa,0x01] +; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] +; +; GFX11-LABEL: v_add_u64_vop2_literal_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x00,0xd7,0xff,0x00,0x02,0x00,0x7b,0x00,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; encoding: [0x01,0x7c,0x20,0xd5,0x80,0x02,0xaa,0x01] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1200-LABEL: v_add_u64_vop2_literal_32: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1200-NEXT: s_wait_expcnt 0x0 ; encoding: [0x00,0x00,0xc4,0xbf] +; GFX1200-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf] +; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, 0x7b, v0 ; encoding: [0x00,0x6a,0x00,0xd7,0xff,0x00,0x02,0x00,0x7b,0x00,0x00,0x00] +; GFX1200-NEXT: s_wait_alu 0xfffd ; encoding: [0xfd,0xff,0x88,0xbf] +; GFX1200-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; encoding: [0x01,0x7c,0x20,0xd5,0x80,0x02,0xaa,0x01] +; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_add_u64_vop2_literal_32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 0x7b ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0xfd,0x03,0x7b,0x00,0x00,0x00] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] + %add = add i64 %x, 123 + ret i64 %add +} + +; GFX9: codeLenInByte = 20 +; GFX10: codeLenInByte = 28 +; GFX1100: codeLenInByte = 32 +; GFX1150: codeLenInByte = 32 +; GFX1250: codeLenInByte = 24 + +define i64 @v_add_u64_vop2_literal_64(i64 %x) { +; GFX9-LABEL: v_add_u64_vop2_literal_64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x12345678, v0 ; encoding: [0xff,0x00,0x00,0x32,0x78,0x56,0x34,0x12] +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc ; encoding: [0x81,0x02,0x02,0x38] +; GFX9-NEXT: s_setpc_b64 
s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] +; +; GFX10-LABEL: v_add_u64_vop2_literal_64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x12345678, v0 ; encoding: [0x00,0x6a,0x0f,0xd7,0xff,0x00,0x02,0x00,0x78,0x56,0x34,0x12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 1, v1, vcc_lo ; encoding: [0x01,0x7d,0x28,0xd5,0x81,0x02,0xaa,0x01] +; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] +; +; GFX11-LABEL: v_add_u64_vop2_literal_64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x12345678, v0 ; encoding: [0x00,0x6a,0x00,0xd7,0xff,0x00,0x02,0x00,0x78,0x56,0x34,0x12] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, v1, vcc_lo ; encoding: [0x01,0x7c,0x20,0xd5,0x81,0x02,0xaa,0x01] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1200-LABEL: v_add_u64_vop2_literal_64: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1200-NEXT: s_wait_expcnt 0x0 ; encoding: [0x00,0x00,0xc4,0xbf] +; GFX1200-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf] +; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, 0x12345678, v0 ; encoding: [0x00,0x6a,0x00,0xd7,0xff,0x00,0x02,0x00,0x78,0x56,0x34,0x12] +; GFX1200-NEXT: s_wait_alu 0xfffd ; encoding: [0xfd,0xff,0x88,0xbf] +; GFX1200-NEXT: v_add_co_ci_u32_e64 v1, null, 1, v1, vcc_lo ; encoding: [0x01,0x7c,0x20,0xd5,0x81,0x02,0xaa,0x01] +; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; GFX1250-LABEL: v_add_u64_vop2_literal_64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; GFX1250-NEXT: s_mov_b64 s[0:1], lit64(0x112345678) ; encoding: [0xfe,0x01,0x80,0xbe,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00] +; GFX1250-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf] +; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] ; encoding: [0x00,0x00,0x52,0xd6,0x00,0x01,0x01,0x00] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] + %add = add i64 %x, 4600387192 + ret i64 %add +} + +; GFX9: codeLenInByte = 20 +; GFX10: codeLenInByte = 28 +; GFX1100: codeLenInByte = 32 +; GFX1150: codeLenInByte = 32 +; GFX1250: codeLenInByte = 36 +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; NOT-GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index f8e13fcdd2273..4cb0d2d7b3789 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -521,16 +521,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 @@ -2710,16 +2713,19 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll new file mode 100644 index 0000000000000..f54fbbaabe9f5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -0,0 +1,2223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +; Test using saddr addressing mode of flat_*load_* instructions. 
+ +; -------------------------------------------------------------------------------- +; No vgpr offset, constants +; -------------------------------------------------------------------------------- + +; SGPR base only +define amdgpu_ps float @flat_load_saddr_i8_offset_0(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %load = load i8, ptr %sbase + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx1250 immediate offset +define amdgpu_ps float @flat_load_saddr_i8_offset_8388607(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_8388607: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388607 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx1250 immediate offset + 1 +define amdgpu_ps float @flat_load_saddr_i8_offset_8388608(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_8388608: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388608 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx1250 immediate offset +define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388608(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_neg8388608: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -8388608 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx1250 immediate offset -1 +define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg8388609: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg8388609: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xff7fffff +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -8388609 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + 
%to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0xff800000 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967295 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000000: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, 1 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000000: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967296 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000001: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000001: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967297 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000FFF: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000FFF: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294971391
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100001000:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100001000:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294971392
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800000, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967295
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000000:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, -1
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000000:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967296
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000001:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-1
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000001:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967297
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; --------------------------------------------------------------------------------
+; Basic addressing patterns
+; --------------------------------------------------------------------------------
+
+; Basic pattern, no immediate offset.
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Maximum positive offset on gfx1250
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388607
+  %load = load i8, ptr %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Maximum positive offset on gfx1250 + 1
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388608
+  %load = load i8, ptr %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Maximum negative offset on gfx1250
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388608(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388608:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -8388608
+  %load = load i8, ptr %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Maximum negative offset on gfx1250 + 1
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388607(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388607:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388607
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -8388607
+  %load = load i8, ptr %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388607
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 %zext.offset
+  %load = load i8, ptr %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; pointer addressing done in integers
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %sbase.as.int = ptrtoint ptr %sbase to i64
+  %add = add i64 %sbase.as.int, %zext.offset
+  %dirty.gep = inttoptr i64 %add to ptr
+  %load = load i8, ptr %dirty.gep
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; zext forced to LHS of addressing expression
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %sbase.as.int = ptrtoint ptr %sbase to i64
+  %add = add i64 %zext.offset, %sbase.as.int
+  %dirty.gep = inttoptr i64 %add to ptr
+  %load = load i8, ptr %dirty.gep
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; zext forced to LHS of addressing expression, with immediate offset
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %sbase.as.int = ptrtoint ptr %sbase to i64
+  %add = add i64 %zext.offset, %sbase.as.int
+  %add.immoffset = add i64 %add, 128
+  %dirty.gep = inttoptr i64 %add.immoffset to ptr
+  %load = load i8, ptr %dirty.gep
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; zext forced to LHS of addressing expression, with immediate offset in non-canonical position
+define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %sbase.as.int = ptrtoint ptr %sbase to i64
+  %add.immoffset = add i64 %sbase.as.int, 128
+  %add = add i64 %zext.offset, %add.immoffset
+  %dirty.gep = inttoptr i64 %add to ptr
+  %load = load i8, ptr %dirty.gep
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; --------------------------------------------------------------------------------
+; Uniformity edge cases
+; --------------------------------------------------------------------------------
+
+@ptr.in.lds = internal addrspace(3) global ptr undef
+
+; Base pointer is uniform, but also in VGPRs
+define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v1
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[0:1]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v1
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %sbase = load ptr, ptr addrspace(3) @ptr.in.lds
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Base pointer is uniform, but also in VGPRs, with imm offset
+define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v1
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
+; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[0:1] offset:42
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v1
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:42
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %sbase = load ptr, ptr addrspace(3) @ptr.in.lds
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 42
+  %load = load i8, ptr %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Both 64-bit base and 32-bit offset are scalar
+define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %soffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
+define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %soffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -24
+  %load = load i8, ptr %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Both components uniform, zext forced to LHS of addressing expression
+define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %soffset to i64
+  %sbase.as.int = ptrtoint ptr %sbase to i64
+  %add = add i64 %zext.offset, %sbase.as.int
+  %dirty.gep = inttoptr i64 %add to ptr
+  %load = load i8, ptr %dirty.gep
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
+define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) {
+; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %soffset to i64
+  %sbase.as.int = ptrtoint ptr %sbase to i64
+  %add = add i64 %zext.offset, %sbase.as.int
+  %add.immoffset = add i64 %add, 128
+  %dirty.gep = inttoptr i64 %add.immoffset to ptr
+  %load = load i8, ptr %dirty.gep
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; divergent 64-bit base, 32-bit scalar offset.
+define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffset) {
+; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %soffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %vbase, i64 %zext.offset
+  %load = load i8, ptr %gep0
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; divergent 64-bit base, 32-bit scalar offset, with imm offset
+define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i32 inreg %soffset) {
+; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %soffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %vbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388607
+  %load = load i8, ptr %gep1
+  %zext = zext i8 %load to i32
+  %to.vgpr = bitcast i32 %zext to float
+  ret float %to.vgpr
+}
+
+; --------------------------------------------------------------------------------
+; Natural addressing shifts with restricted range
+; --------------------------------------------------------------------------------
+
+; Cannot push the shift into 32-bits, and cannot match.
+define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_f32_natural_addressing:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
+; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_f32_natural_addressing:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
+; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %voffset = load i32, ptr %voffset.ptr
+  %zext.offset = zext i32 %voffset to i64
+  %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
+  %load = load float, ptr %gep
+  ret float %load
+}
+
+; Cannot push the shift into 32-bits, with an immediate offset.
+define amdgpu_ps float @flat_load_saddr_f32_natural_addressing_immoffset(ptr inreg %sbase, ptr %voffset.ptr) {
+; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing_immoffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %voffset = load i32, ptr %voffset.ptr
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 128
+  %load = load float, ptr %gep1
+  ret float %load
+}
+
+; Range is sufficiently restricted to push the shift into 32-bits.
+define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, ptr %voffset.ptr) {
+; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{}
+  %zext.offset = zext i32 %voffset to i64
+  %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
+  %load = load float, ptr %gep
+  ret float %load
+}
+
+; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
+define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg %sbase, ptr %voffset.ptr) {
+; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_imm_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{}
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds float, ptr %gep0, i64 100
+  %load = load float, ptr %gep1
+  ret float %load
+}
+
+; Range is 1 beyond the limit where we can move the shift into 32-bits.
+define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) {
+; GFX1250-SDAG-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
+; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
+; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %voffset = load i32, ptr %voffset.ptr, !range !1, !noundef !{}
+  %zext.offset = zext i32 %voffset to i64
+  %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
+  %load = load float, ptr %gep
+  ret float %load
+}
+
+; --------------------------------------------------------------------------------
+; Stress various type loads
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps half @flat_load_saddr_i16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i16, ptr %gep0
+  %cast.load = bitcast i16 %load to half
+  ret half %cast.load
+}
+
+define amdgpu_ps half @flat_load_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i16, ptr %gep1
+  %cast.load = bitcast i16 %load to half
+  ret half %cast.load
+}
+
+define amdgpu_ps half @flat_load_saddr_f16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load half, ptr %gep0
+  ret half %load
+}
+
+define amdgpu_ps half @flat_load_saddr_f16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_f16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load half, ptr %gep1
+  ret half %load
+}
+
+define amdgpu_ps float @flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i32, ptr %gep0
+  %cast.load = bitcast i32 %load to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i32, ptr %gep1
+  %cast.load = bitcast i32 %load to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_load_saddr_f32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load float, ptr %gep0
+  ret float %load
+}
+
+define amdgpu_ps float @flat_load_saddr_f32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_f32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load float, ptr %gep1
+  ret float %load
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_v2i16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <2 x i16>, ptr %gep0
+  %cast.load = bitcast <2 x i16> %load to <2 x half>
+  ret <2 x half> %cast.load
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_v2i16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2i16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <2 x i16>, ptr %gep1
+  %cast.load = bitcast <2 x i16> %load to <2 x half>
+  ret <2 x half> %cast.load
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_v2f16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <2 x half>, ptr %gep0
+  ret <2 x half> %load
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_v2f16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2f16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <2 x half>, ptr %gep1
+  ret <2 x half> %load
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_p3(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_p3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load ptr addrspace(3), ptr %gep0
+  %cast.load0 = ptrtoint ptr addrspace(3) %load to i32
+  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
+  ret <2 x half> %cast.load1
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_p3_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_p3_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load ptr addrspace(3), ptr %gep1
+  %cast.load0 = ptrtoint ptr addrspace(3) %load to i32
+  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
+  ret <2 x half> %cast.load1
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load double, ptr %gep0
+  %cast.load = bitcast double %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_f64_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load double, ptr %gep1
+  %cast.load = bitcast double %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i64, ptr %gep0
+  %cast.load = bitcast i64 %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i64_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i64, ptr %gep1
+  %cast.load = bitcast i64 %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <2 x float>, ptr %gep0
+  ret <2 x float> %load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2f32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <2 x float>, ptr %gep1
+  ret <2 x float> %load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <2 x i32>, ptr %gep0
+  %cast.load = bitcast <2 x i32> %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2i32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <2 x i32>, ptr %gep1
+  %cast.load = bitcast <2 x i32> %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <4 x i16>, ptr %gep0
+  %cast.load = bitcast <4 x i16> %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4i16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <4 x i16>, ptr %gep1
+  %cast.load = bitcast <4 x i16> %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <4 x half>, ptr %gep0
+  %cast.load = bitcast <4 x half> %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4f16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <4 x half>, ptr %gep1
+  %cast.load = bitcast <4 x half> %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_p1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load ptr, ptr %gep0
+  %cast.load0 = ptrtoint ptr %load to i64
+  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
+  ret <2 x float> %cast.load1
+}
+
+define amdgpu_ps <2 x float> @flat_load_saddr_p1_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_p1_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load ptr, ptr %gep1
+  %cast.load0 = ptrtoint ptr %load to i64
+  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
+  ret <2 x float> %cast.load1
+}
+
+define amdgpu_ps <3 x float> @flat_load_saddr_v3f32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v3f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <3 x float>, ptr %gep0
+  ret <3 x float> %load
+}
+
+define amdgpu_ps <3 x float> @flat_load_saddr_v3f32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v3f32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <3 x float>, ptr %gep1
+  ret <3 x float> %load
+}
+
+define amdgpu_ps <3 x float> @flat_load_saddr_v3i32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v3i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <3 x i32>, ptr %gep0
+  %cast.load = bitcast <3 x i32> %load to <3 x float>
+  ret <3 x float> %cast.load
+}
+
+define amdgpu_ps <3 x float> @flat_load_saddr_v3i32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v3i32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <3 x i32>, ptr %gep1
+  %cast.load = bitcast <3 x i32> %load to <3 x float>
+  ret <3 x float> %cast.load
+}
+
+define amdgpu_ps <6 x half> @flat_load_saddr_v6f16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v6f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <6 x half>, ptr %gep0
+  ret <6 x half> %load
+}
+
+define amdgpu_ps <6 x half> @flat_load_saddr_v6f16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v6f16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <6 x half>, ptr %gep1
+  ret <6 x half> %load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v4f32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <4 x float>, ptr %gep0
+  ret <4 x float> %load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v4f32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4f32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <4 x float>, ptr %gep1
+  ret <4 x float> %load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v4i32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <4 x i32>, ptr %gep0
+  %cast.load = bitcast <4 x i32> %load to <4 x float>
+  ret <4 x float> %cast.load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v4i32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4i32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <4 x i32>, ptr %gep1
+  %cast.load = bitcast <4 x i32> %load to <4 x float>
+  ret <4 x float> %cast.load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v2i64(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <2 x i64>, ptr %gep0
+  %cast.load = bitcast <2 x i64> %load to <4 x float>
+  ret <4 x float> %cast.load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v2i64_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2i64_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <2 x i64>, ptr %gep1
+  %cast.load = bitcast <2 x i64> %load to <4 x float>
+  ret <4 x float> %cast.load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_i128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i128, ptr %gep0
+  %cast.load = bitcast i128 %load to <4 x float>
+  ret <4 x float> %cast.load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_i128_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i128_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i128, ptr %gep1
+  %cast.load = bitcast i128 %load to <4 x float>
+  ret <4 x float> %cast.load
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v2p1(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2p1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <2 x ptr>, ptr %gep0
+  %cast.load0 = ptrtoint <2 x ptr> %load to <2 x i64>
+  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
+  ret <4 x float> %cast.load1
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v2p1_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v2p1_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <2 x ptr>, ptr %gep1
+  %cast.load0 = ptrtoint <2 x ptr> %load to <2 x i64>
+  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
+  ret <4 x float> %cast.load1
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v4p3(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4p3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load <4 x ptr addrspace(3)>, ptr %gep0
+  %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32>
+  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
+  ret <4 x float> %cast.load1
+}
+
+define amdgpu_ps <4 x float> @flat_load_saddr_v4p3_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_v4p3_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load <4 x ptr addrspace(3)>, ptr %gep1
+  %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32>
+  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
+  ret <4 x float> %cast.load1
+}
+
+; --------------------------------------------------------------------------------
+; Extending loads
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @flat_sextload_saddr_i8(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_sextload_saddr_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_i8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i8, ptr %gep0
+  %sextload = sext i8 %load to i32
+  %cast.load = bitcast i32 %sextload to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_sextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_sextload_saddr_i8_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i8, ptr %gep1
+  %sextload = sext i8 %load to i32
+  %cast.load = bitcast i32 %sextload to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_sextload_saddr_i16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_sextload_saddr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_i16 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i16, ptr %gep0
+  %sextload = sext i16 %load to i32
+  %cast.load = bitcast i32 %sextload to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_sextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_sextload_saddr_i16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_i16 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i16, ptr %gep1
+  %sextload = sext i16 %load to i32
+  %cast.load = bitcast i32 %sextload to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_zextload_saddr_i8(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_zextload_saddr_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i8, ptr %gep0
+  %zextload = zext i8 %load to i32
+  %cast.load = bitcast i32 %zextload to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_zextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_zextload_saddr_i8_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i8, ptr %gep1
+  %zextload = zext i8 %load to i32
+  %cast.load = bitcast i32 %zextload to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_zextload_saddr_i16(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_zextload_saddr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i16, ptr %gep0
+  %zextload = zext i16 %load to i32
+  %cast.load = bitcast i32 %zextload to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @flat_zextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_zextload_saddr_i16_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i16, ptr %gep1
+  %zextload = zext i16 %load to i32
+  %cast.load = bitcast i32 %zextload to float
+  ret float %cast.load
+}
+
+; --------------------------------------------------------------------------------
+; Atomic load
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @atomic_flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: atomic_flat_load_saddr_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load atomic i32, ptr %gep0 seq_cst, align 4
+  %cast.load = bitcast i32 %load to float
+  ret float %cast.load
+}
+
+define amdgpu_ps float @atomic_flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: atomic_flat_load_saddr_i32_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load atomic i32, ptr %gep1 seq_cst, align 4
+  %cast.load = bitcast i32 %load to float
+  ret float %cast.load
+}
+
+define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: atomic_flat_load_saddr_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load atomic i64, ptr %gep0 seq_cst, align 8
+  %cast.load = bitcast i64 %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: atomic_flat_load_saddr_i64_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load atomic i64, ptr %gep1 seq_cst, align 8
+  %cast.load = bitcast i64 %load to <2 x float>
+  ret <2 x float> %cast.load
+}
+
+; --------------------------------------------------------------------------------
+; D16 load (low 16)
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i16_d16lo_undef_hi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i16, ptr %gep0
+  %build = insertelement <2 x i16> undef, i16 %load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i16, ptr %gep1
+  %build = insertelement <2 x i16> undef, i16 %load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i16_d16lo_zero_hi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i16, ptr %gep0
+  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) {
+; GFX1250-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i16, ptr %gep1
+  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i16, ptr %gep0
+  %build = insertelement <2 x i16> %reg, i16 %load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i16, ptr %gep1
+  %build = insertelement <2 x i16> %reg, i16 %load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i8, ptr %gep0
+  %zext.load = zext i8 %load to i16
+  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i8, ptr %gep1
+  %zext.load = zext i8 %load to i16
+  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %load = load i8, ptr %gep0
+  %sext.load = sext i8 %load to i16
+  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %load = load i8, ptr %gep1
+  %sext.load = sext i8 %load to i16
+  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
+  %cast = bitcast <2 x i16> %build to <2 x half>
+  ret <2 x half> %cast
+}
+
+; --------------------------------------------------------------------------------
+; D16 hi load (hi16)
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase, i32 %voffset) {
+;
GFX1250-LABEL: flat_load_saddr_i16_d16hi_undef_hi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] +; 
GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> %reg, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> %reg, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; 
GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +; -------------------------------------------------------------------------------- +; or-with-constant as add +; 
-------------------------------------------------------------------------------- + +; Check add-as-or with split 64-bit or. +define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_bitop2_b32 v0, 16, v0 bitop3:0x54 +; GFX1250-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.idx = zext i32 %idx to i64 + %or = or i64 %zext.idx, 16 + %addr = inttoptr i64 %or to ptr + %load = load i8, ptr %addr + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.idx = zext i32 %idx to i64 + %or = or i64 %zext.idx, 4160 + %addr = inttoptr i64 %or to ptr + %load = load i8, ptr %addr + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Full 64-bit scalar add. 
+; -------------------------------------------------------------------------------- + +define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { +; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3 +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4 +; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 +; GFX1250-GISEL-NEXT: s_endpgm +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] + %i4 = zext i32 %i to i64 + %i5 = getelementptr inbounds float, ptr %arg, i64 %i4 + %i6 = load volatile float, ptr %i5, align 4 + %i8 = add nuw nsw i32 %i, 1 + %i9 = icmp eq i32 %i8, 256 + br i1 %i9, label %bb2, label %bb3 +} + +; Make sure we only have a single zero vaddr initialization. 
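+; (A reading of the SDAG checks below, not an assertion beyond them: the
+; single initialization shows up as one `v_mov_b32_e32 v0, 0` hoisted above
+; the loop, with v0 then reused as the zero vaddr operand of both
+; flat_load_b32 instructions inside the loop body.)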
+ +define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inreg %arg.1) { +; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv_multiload: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4 +; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 +; GFX1250-GISEL-NEXT: s_endpgm +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] + %i4 = zext i32 %i to i64 + %i5 = getelementptr inbounds float, ptr %arg, i64 %i4 + %i6 = load volatile float, ptr %i5, align 4 + %i5.1 = getelementptr inbounds float, ptr %arg.1, i64 %i4 + %i6.1 = load volatile float, ptr %i5, align 4 + %i8 = add nuw nsw i32 %i, 1 + %i9 = icmp eq i32 %i8, 256 + br i1 %i9, label %bb2, label %bb3 +} + +!0 = !{i32 0, i32 1073741824} ; (1 << 30) +!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll new file mode 100644 index 0000000000000..32888d2acf1cd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll @@ -0,0 +1,1118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +; Test using saddr addressing mode of flat_*store_* instructions. 
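+;
+; Most of the stores below derive the address with the same IR pattern, so
+; the backend can select the saddr form (64-bit scalar base in SGPRs plus a
+; 32-bit VGPR offset and an optional immediate offset). A minimal sketch of
+; that pattern, using the operand names the tests in this file use:
+;
+;   %zext.offset = zext i32 %voffset to i64
+;   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+;   store i8 %data, ptr %gep0
+;
+; i.e. the effective address is %sbase + zext(%voffset), plus any immediate
+; offset folded into the instruction's offset field.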
+ +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i8 %data, ptr %gep0 + ret void +} + +; Maximum positive offset on gfx10 +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_2047(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_2047: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:2047 +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2047 + store i8 %data, ptr %gep1 + ret void +} + +; Maximum negative offset on gfx10 +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_neg2048: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:-2048 +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 + store i8 %data, ptr %gep1 + ret void +} + +; -------------------------------------------------------------------------------- +; Uniformity edge cases +; -------------------------------------------------------------------------------- + +@ptr.in.lds = internal addrspace(3) global ptr undef + +; Base pointer is uniform, but also in VGPRs +define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 +; GFX1250-GISEL-NEXT: s_endpgm + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i8 %data, ptr %gep0 + ret void +} + +; Base pointer is uniform, but also in VGPRs, with imm offset +define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-SDAG: ; %bb.0: +; 
GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] offset:-120 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 offset:-120 +; GFX1250-GISEL-NEXT: s_endpgm + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -120 + store i8 %data, ptr %gep1 + ret void +} + +; -------------------------------------------------------------------------------- +; Stress various type stores +; -------------------------------------------------------------------------------- + +define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, i16 %data) { +; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i16 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i16 %data) { +; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i16 %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, half %data) { +; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store half %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, half %data) { +; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store half %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i32 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 
%voffset, i32 %data) { +; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i32 %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, float %data) { +; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store float %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, float %data) { +; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store float %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) { +; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store ptr addrspace(3) %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) { +; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store ptr addrspace(3) %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i64 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: 
flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i64 %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, double %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store double %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, double %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store double %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x i32> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x i32> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void 
@flat_store_saddr_v2f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x float> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x float> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x i16> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x i16> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], 
s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x half> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x half> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store ptr %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store ptr %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], 
s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <3 x i32> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <3 x i32> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <3 x float> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <3 x float> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: 
v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <6 x i16> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <6 x i16> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <6 x half> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <6 x half> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; 
GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x i32> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x i32> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x float> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x float> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, 
v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x i64> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x i64> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x double> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x double> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x double> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + 
store <2 x double> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <8 x i16> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <8 x i16> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <8 x half> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 
v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <8 x half> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x ptr> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x ptr> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x ptr addrspace(3)> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 
v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x ptr addrspace(3)> %data, ptr %gep1 + ret void +} + +; -------------------------------------------------------------------------------- +; Atomic store +; -------------------------------------------------------------------------------- + +define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store atomic i32 %data, ptr %gep0 seq_cst, align 4 + ret void +} + +define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store atomic i32 %data, ptr %gep1 seq_cst, align 4 + ret void +} + +define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store atomic i64 %data, ptr %gep0 seq_cst, align 8 + ret void +} + +define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128: +; 
GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store atomic i64 %data, ptr %gep1 seq_cst, align 8 + ret void +} + +; -------------------------------------------------------------------------------- +; D16 HI store (hi 16) +; -------------------------------------------------------------------------------- + +define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { +; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %data.hi = extractelement <2 x i16> %data, i32 1 + store i16 %data.hi, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { +; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %data.hi = extractelement <2 x i16> %data, i32 1 + store i16 %data.hi, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { +; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %data.hi = extractelement <2 x i16> %data, i32 1 + %data.hi.trunc = trunc i16 %data.hi to i8 + store i8 %data.hi.trunc, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { +; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %data.hi = extractelement <2 x i16> %data, i32 1 + %data.hi.trunc = trunc i16 %data.hi to i8 + store i8 %data.hi.trunc, ptr %gep1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fma.f64.ll b/llvm/test/CodeGen/AMDGPU/fma.f64.ll index 99f7d4da685d6..e4488258dcf88 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f64.ll @@ -1,7 +1,8 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SIGFX11 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SIGFX11 %s -; RUN: llc -amdgpu-scalarize-global-loads=false 
-mtriple=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX90A %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SIGFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,FMAC_F64 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,FMA_F64 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=FUNC,GCN,FMAC_F64 %s declare double @llvm.fma.f64(double, double, double) nounwind readnone declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone @@ -9,8 +10,8 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) n declare double @llvm.fabs.f64(double) nounwind readnone ; FUNC-LABEL: {{^}}fma_f64: -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { %r0 = load double, ptr addrspace(1) %in1 @@ -22,10 +23,10 @@ define amdgpu_kernel void @fma_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, } ; FUNC-LABEL: {{^}}fma_v2f64: -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { %r0 = load <2 x double>, ptr addrspace(1) %in1 @@ -37,14 +38,14 @@ define amdgpu_kernel void @fma_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in } ; FUNC-LABEL: {{^}}fma_v4f64: -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 
{{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { %r0 = load <4 x double>, ptr addrspace(1) %in1 @@ -176,8 +177,8 @@ define amdgpu_kernel void @fma_f64_abs_neg_src2(ptr addrspace(1) %out, ptr addrs } ; FUNC-LABEL: {{^}}fma_f64_lit_src0: -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_lit_src0(ptr addrspace(1) %out, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { %r1 = load double, ptr addrspace(1) %in2 @@ -188,8 +189,8 @@ define amdgpu_kernel void @fma_f64_lit_src0(ptr addrspace(1) %out, } ; FUNC-LABEL: {{^}}fma_f64_lit_src1: -; SIGFX11: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} -; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} +; FMA_F64: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} +; FMAC_F64: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_lit_src1(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in3) { %r0 = load double, ptr addrspace(1) %in1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-commute-sgpr.mir b/llvm/test/CodeGen/AMDGPU/fold-commute-sgpr.mir new file mode 100644 index 0000000000000..c6bc248f13388 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-commute-sgpr.mir @@ -0,0 +1,24 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: fold_commute_sgprs +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; CHECK-LABEL: name: fold_commute_sgprs + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[V_ADD_NC_U16_fake16_e64_dpp:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, 280, 15, 15, 1, implicit $exec + 
%0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = IMPLICIT_DEF + %2:vgpr_32 = COPY %1:sreg_32 + %3:vgpr_32 = COPY %0:sreg_32 + %4:sreg_32 = COPY $sgpr1 + %5:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %2:vgpr_32, 0, %3:vgpr_32, 0, %4:sreg_32, 0, 0, 280, 15, 15, 1, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index ac438062ae208..9a347d71bf430 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -14592,5 +14592,241 @@ define void @freeze_v4i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { store <4 x i1> %freeze, ptr addrspace(1) %ptrb ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX8-SDAG: {{.*}} + +define double @freeze_fabs_double(float %a, double %b, double %c) { +; GFX6-SDAG-LABEL: freeze_fabs_double: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX6-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX6-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: freeze_fabs_double: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX6-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX6-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX6-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-SDAG-LABEL: freeze_fabs_double: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX7-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX7-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: freeze_fabs_double: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX7-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX7-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX7-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: freeze_fabs_double: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX8-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX8-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: freeze_fabs_double: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX8-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX8-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX8-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: freeze_fabs_double: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX9-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: freeze_fabs_double: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX10-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: freeze_fabs_double: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX10-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: freeze_fabs_double: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX11-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: freeze_fabs_double: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX11-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %pv = insertelement <2 x float> poison, float %a, i32 1 + %d = bitcast <2 x float> %pv to double + %r = call double @llvm.fabs.f64(double %d) + %fr = freeze double %r + %add1 = fadd double %fr, %b + %add2 = fadd double %fr, %c + %add = fadd double %add1, %add2 + ret double %add +} + +define <4 x float> @freeze_fabs_v4float(<4 x float> %A, <4 x float> %B) { +; GFX6-SDAG-LABEL: freeze_fabs_v4float: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX6-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: freeze_fabs_v4float: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX6-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX6-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX6-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-SDAG-LABEL: freeze_fabs_v4float: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: freeze_fabs_v4float: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-GISEL-NEXT: v_and_b32_e32 
v1, 0x7fffffff, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: freeze_fabs_v4float: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX8-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX8-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: freeze_fabs_v4float: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX8-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX8-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: freeze_fabs_v4float: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: freeze_fabs_v4float: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: freeze_fabs_v4float: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: freeze_fabs_v4float: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: freeze_fabs_v4float: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 
+; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %A0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %A) + %F1 = freeze <4 x float> %A0 + %A1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %F1) + ret <4 x float> %A1 +} diff --git a/llvm/test/CodeGen/AMDGPU/global-address.ll b/llvm/test/CodeGen/AMDGPU/global-address.ll new file mode 100644 index 0000000000000..60f4f0c762cf6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-address.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-PAL-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-PAL-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-HSA %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-HSA %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-HSA %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-HSA %s + +define amdgpu_kernel void @caller_internal() { +; GFX11-PAL-SDAG-LABEL: caller_internal: +; GFX11-PAL-SDAG: ; %bb.0: +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s1, internal_func@abs32@hi +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s0, internal_func@abs32@lo +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX11-PAL-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-PAL-SDAG-NEXT: s_endpgm +; +; GFX11-PAL-GISEL-LABEL: caller_internal: +; GFX11-PAL-GISEL: ; %bb.0: +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s0, internal_func@abs32@lo +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s1, internal_func@abs32@hi +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX11-PAL-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-PAL-GISEL-NEXT: s_endpgm +; +; GFX1250-PAL-LABEL: caller_internal: +; GFX1250-PAL: ; %bb.0: +; GFX1250-PAL-NEXT: s_mov_b64 s[0:1], internal_func@abs64 +; GFX1250-PAL-NEXT: s_mov_b32 s32, 0 +; GFX1250-PAL-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-PAL-NEXT: s_endpgm +; +; GFX11-HSA-LABEL: caller_internal: +; GFX11-HSA: ; %bb.0: +; GFX11-HSA-NEXT: s_getpc_b64 s[0:1] +; GFX11-HSA-NEXT: s_add_u32 s0, s0, internal_func@gotpcrel32@lo+4 +; GFX11-HSA-NEXT: s_addc_u32 s1, s1, internal_func@gotpcrel32@hi+12 +; GFX11-HSA-NEXT: s_mov_b32 s32, 0 +; GFX11-HSA-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-HSA-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-HSA-NEXT: s_endpgm +; +; GFX1250-HSA-LABEL: caller_internal: +; GFX1250-HSA: ; %bb.0: +; GFX1250-HSA-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-HSA-NEXT: s_add_nc_u64 s[0:1], s[0:1], internal_func@gotpcrel+4 +; GFX1250-HSA-NEXT: s_mov_b32 s32, 0 +; GFX1250-HSA-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-HSA-NEXT: s_wait_kmcnt 0x0 +; GFX1250-HSA-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-HSA-NEXT: s_endpgm + call amdgpu_gfx void @internal_func() + ret void +} + +define amdgpu_kernel void @caller_external() {
+; GFX11-PAL-SDAG-LABEL: caller_external: +; GFX11-PAL-SDAG: ; %bb.0: +; GFX11-PAL-SDAG-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s12, s13 +; GFX11-PAL-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-PAL-SDAG-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s17, external_func@abs32@hi +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s16, external_func@abs32@lo +; GFX11-PAL-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-PAL-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s13, s14 +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s14, s15 +; GFX11-PAL-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX11-PAL-SDAG-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX11-PAL-SDAG-NEXT: s_endpgm +; +; GFX11-PAL-GISEL-LABEL: caller_external: +; GFX11-PAL-GISEL: ; %bb.0: +; GFX11-PAL-GISEL-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s12, s13 +; GFX11-PAL-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-PAL-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s16, external_func@abs32@lo +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s17, external_func@abs32@hi +; GFX11-PAL-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-PAL-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s13, s14 +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s14, s15 +; GFX11-PAL-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX11-PAL-GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX11-PAL-GISEL-NEXT: s_endpgm +; +; GFX1250-PAL-LABEL: caller_external: +; GFX1250-PAL: ; %bb.0: +; GFX1250-PAL-NEXT: v_mov_b32_e32 v31, v0 +; GFX1250-PAL-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1250-PAL-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1250-PAL-NEXT: s_mov_b64 s[12:13], external_func@abs64 +; GFX1250-PAL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1250-PAL-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1250-PAL-NEXT: s_mov_b32 s32, 0 +; GFX1250-PAL-NEXT: s_swap_pc_i64 s[30:31], s[12:13] +; GFX1250-PAL-NEXT: s_endpgm +; +; GFX11-HSA-LABEL: caller_external: +; GFX11-HSA: ; %bb.0: +; GFX11-HSA-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-HSA-NEXT: s_mov_b32 s12, s13 +; GFX11-HSA-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-HSA-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX11-HSA-NEXT: s_getpc_b64 s[16:17] +; GFX11-HSA-NEXT: s_add_u32 s16, s16, external_func@rel32@lo+4 +; GFX11-HSA-NEXT: s_addc_u32 s17, s17, external_func@rel32@hi+12 +; GFX11-HSA-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-HSA-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-HSA-NEXT: s_mov_b32 s13, s14 +; GFX11-HSA-NEXT: s_mov_b32 s14, s15 +; GFX11-HSA-NEXT: s_mov_b32 s32, 0 +; GFX11-HSA-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX11-HSA-NEXT: s_endpgm +; +; GFX1250-HSA-LABEL: caller_external: +; GFX1250-HSA: ; %bb.0: +; GFX1250-HSA-NEXT: v_mov_b32_e32 v31, v0 +; GFX1250-HSA-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1250-HSA-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1250-HSA-NEXT: s_get_pc_i64 s[12:13] +; GFX1250-HSA-NEXT: s_add_nc_u64 s[12:13], s[12:13], external_func@rel64+4 +; GFX1250-HSA-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1250-HSA-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1250-HSA-NEXT: s_mov_b32 s32, 0 +; GFX1250-HSA-NEXT: s_swap_pc_i64 s[30:31], s[12:13] +; GFX1250-HSA-NEXT: s_endpgm + call void @external_func() + ret void +} + +declare amdgpu_gfx void @internal_func() +declare hidden void @external_func() diff --git a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir index 2cc25d88347ee..c34c9749d553a 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir @@ -17,24 +17,22 @@ body: | liveins: $vgpr0,
$sgpr4_sgpr5 ; CHECK-LABEL: name: av_mov_b32_split - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5 + ; CHECK: liveins: $agpr3, $agpr4, $vgpr0, $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec ; CHECK-NEXT: renamable $agpr1 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec ; CHECK-NEXT: renamable $agpr2 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; CHECK-NEXT: renamable $agpr0 = V_ACCVGPR_WRITE_B32_e64 3, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; CHECK-NEXT: renamable $agpr0 = V_ACCVGPR_WRITE_B32_e64 4, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 3, implicit $exec + ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr2 - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 + ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0 %0:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec %1:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec %2:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec @@ -68,29 +66,25 @@ body: | liveins: $vgpr0, $sgpr4_sgpr5 ; CHECK-LABEL: name: v_mov_b32_split - ; CHECK: liveins: $vgpr0, $vgpr3, $vgpr4, $vgpr5, $sgpr4_sgpr5 + ; CHECK: liveins: $agpr3, $agpr4, $vgpr0, $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: renamable $vgpr2 = V_MOV_B32_e32 2, implicit $exec ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 3, implicit $exec - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 4, implicit $exec - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec + ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; CHECK-NEXT: S_NOP 0, implicit killed renamable 
$agpr0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr2 - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 + ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec %2:vgpr_32 = V_MOV_B32_e32 2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir index c1e0d0716acae..11de6c8d52d59 100644 --- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir +++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir @@ -27,7 +27,7 @@ # CHECK-LABEL: name: inflated_reg_class_copy_use_after_free # CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3 # CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) -# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) +# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) # CHECK-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[RESTORE0]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec # CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 { # CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir new file mode 100644 index 0000000000000..0abf34797a5e7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir @@ -0,0 +1,73 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s + +--- +name: wmma_xdl_twoaddr_trans +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}wmma_xdl_twoaddr_trans: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[8:15], v[0:7], v[0:7], v[8:15] + ; CHECK-NEXT: v_exp_f32_e32 v16, v16 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v17, v17, v8 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16, $vgpr17 + $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, 0, implicit $exec + $vgpr16 = V_EXP_F32_e32 $vgpr16, implicit $exec, implicit $mode + $vgpr17 = V_ADD_U32_e32 $vgpr17, $vgpr8, implicit $exec +...
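+ +# The remaining cases mirror the pattern above: the three-address WMMA and the +# SWMMAC forms are likewise XDL ops, so a consumer behind the intervening +# transcendental still expects TRANS32_DEP_2, while a non-XDL WMMA only needs VALU_DEP_1.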
+ +--- +name: wmma_xdl_threeaddr_trans +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}wmma_xdl_threeaddr_trans: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[8:15], v[0:7], v[0:7], v[16:23] + ; CHECK-NEXT: v_exp_f32_e32 v24, v24 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v25, v25, v8 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24, $vgpr25 + $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_WMMA_F32_16X16X64_FP8_FP8_w32_threeaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr24 = V_EXP_F32_e32 $vgpr24, implicit $exec, implicit $mode + $vgpr25 = V_ADD_U32_e32 $vgpr25, $vgpr8, implicit $exec +... + +--- +name: swmmac_xdl_twoaddr_trans +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}swmmac_xdl_twoaddr_trans: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] + ; CHECK-NEXT: v_exp_f32_e32 v30, v30 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v31, v31, v24 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27, $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr30 = V_EXP_F32_e32 $vgpr30, implicit $exec, implicit $mode + $vgpr31 = V_ADD_U32_e32 $vgpr31, $vgpr24, implicit $exec +... + +--- +name: wmma_non_xdl_large_data_valu +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}wmma_non_xdl_large_data_valu: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse + ; CHECK-NEXT: v_exp_f32_e32 v12, v12 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v13, v13, v8 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12, $vgpr13 + $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, -1, 0, 0, implicit $exec + $vgpr12 = V_EXP_F32_e32 $vgpr12, implicit $exec, implicit $mode + $vgpr13 = V_ADD_U32_e32 $vgpr13, $vgpr8, implicit $exec +...
diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll new file mode 100644 index 0000000000000..df4ff2c8d9851 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/literal64.ll @@ -0,0 +1,335 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GCN-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GCN-GISEL %s + +define amdgpu_ps i64 @s_add_u64(i64 inreg %a) { +; GCN-LABEL: s_add_u64: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], lit64(0xf12345678) +; GCN-NEXT: ; return to shader part epilog + %result = add i64 %a, 64729929336 + ret i64 %result +} + +define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) { +; GCN-SDAG-LABEL: v_add_u64: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xf12345678) +; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-SDAG-NEXT: s_endpgm +; +; GCN-GISEL-LABEL: v_add_u64: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0xf12345678) +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] +; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-GISEL-NEXT: s_endpgm + %result = add i64 %a, 64729929336 + store i64 %result, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) { +; GCN-LABEL: s_add_neg_u64: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], lit64(0xfffffff0edcba988) +; GCN-NEXT: ; return to shader part epilog + %result = sub i64 %a, 64729929336 + ret i64 %result +} + +define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) { +; GCN-SDAG-LABEL: v_add_neg_u64: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xfffffff0edcba988) +; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-SDAG-NEXT: s_endpgm +; +; GCN-GISEL-LABEL: v_add_neg_u64: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0xfffffff0edcba988) +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] +; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-GISEL-NEXT: s_endpgm + %result = sub i64 %a, 64729929336 + store i64 %result, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) { +; GCN-LABEL: s_sub_u64: +; GCN: ; %bb.0: +; GCN-NEXT: s_sub_nc_u64 s[0:1], lit64(0xf12345678), s[0:1] +; GCN-NEXT: ; return to shader part epilog + %result = sub i64 64729929336, %a + ret i64 %result +} + +define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) { +; GCN-LABEL: v_sub_u64: +; GCN: ; %bb.0: +; GCN-NEXT: v_sub_co_u32 v0, vcc_lo, 0x12345678, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_sub_co_ci_u32_e64 v1, null, 15, v1, vcc_lo +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm + %result = sub i64 64729929336, %a + store i64 %result, ptr addrspace(1) %out, align 8 + ret void +} + +define void @v_mov_b64_double(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_mov_b64_double: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: 
global_load_b64 v[4:5], v[0:1], off +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: .LBB6_1: ; %atomicrmw.start +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: v_add_f64_e32 v[2:3], lit64(0x4063233333333333), v[4:5] +; GCN-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GCN-NEXT: s_wait_xcnt 0x0 +; GCN-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GCN-NEXT: s_wait_alu 0xfffe +; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 +; GCN-NEXT: s_wait_alu 0xfffe +; GCN-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: s_cbranch_execnz .LBB6_1 +; GCN-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN-NEXT: s_set_pc_i64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 153.1 monotonic + ret void +} + +define void @v_mov_b64_int(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_mov_b64_int: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b64_e32 v[2:3], lit64(0xf12345678) +; GCN-NEXT: global_atomic_add_u64 v[0:1], v[2:3], off scope:SCOPE_SYS +; GCN-NEXT: s_set_pc_i64 s[30:31] + %result = atomicrmw add ptr addrspace(1) %ptr, i64 64729929336 monotonic + ret void +} + +define void @store_double(ptr addrspace(1) %ptr) { +; GCN-LABEL: store_double: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4063233333333333) +; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off +; GCN-NEXT: s_set_pc_i64 s[30:31] + store double 153.1, ptr addrspace(1) %ptr + ret void +} + +define i1 @class_f64() noinline optnone { +; GCN-SDAG-LABEL: class_f64: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-NEXT: s_mov_b32 s2, 1 +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0x4063233333333333) +; GCN-SDAG-NEXT: s_wait_alu 0xfffe +; GCN-SDAG-NEXT: v_cmp_class_f64_e64 s0, s[0:1], s2 +; GCN-SDAG-NEXT: s_wait_alu 0xf1ff +; GCN-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-GISEL-LABEL: class_f64: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-NEXT: s_mov_b32 s2, 1 +; GCN-GISEL-NEXT: s_mov_b64 s[0:1], lit64(0x4063233333333333) +; GCN-GISEL-NEXT: s_wait_alu 0xfffe +; GCN-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GCN-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GCN-GISEL-NEXT: v_cmp_class_f64_e64 s0, v[0:1], v2 +; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GCN-GISEL-NEXT: s_wait_alu 0xf1ff +; GCN-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, v0, s0 +; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] + %result = call i1 @llvm.amdgcn.class.f64(double 153.1, i32 1) nounwind readnone + ret i1 %result +} + +define double @rsq_f64() { +; GCN-LABEL: rsq_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_rsq_f64_e32 v[0:1], lit64(0x4063233333333333) +; GCN-NEXT: s_set_pc_i64 s[30:31] + %result = call double @llvm.amdgcn.rsq.f64(double 153.1) nounwind readnone + ret double %result +} + +define amdgpu_ps i64 @s_and_b64(i64 inreg %a) { +; GCN-LABEL: s_and_b64: +; GCN: ; %bb.0: +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], lit64(0xf12345678) +; GCN-NEXT: ; return to shader part epilog + %result = and i64 %a, 64729929336 + ret i64 %result +} + +; No V_AND_B64 instruction, it has to be split + +define 
amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) { +; GCN-SDAG-LABEL: v_and_b64: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: v_and_b32_e32 v1, 15, v1 +; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x12345678, v0 +; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-SDAG-NEXT: s_endpgm +; +; GCN-GISEL-LABEL: v_and_b64: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x12345678, v0 +; GCN-GISEL-NEXT: v_and_b32_e32 v1, 15, v1 +; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-GISEL-NEXT: s_endpgm + %result = and i64 %a, 64729929336 + store i64 %result, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps <2 x float> @v_add_f64_200.1(double %a) { +; GCN-LABEL: v_add_f64_200.1: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_f64_e32 v[0:1], lit64(0x4069033333333333), v[0:1] +; GCN-NEXT: ; return to shader part epilog + %add = fadd double %a, 200.1 + %ret = bitcast double %add to <2 x float> + ret <2 x float> %ret +} + +; 200.0 can be encoded as 32-bit literal + +define amdgpu_ps <2 x float> @v_add_f64_200.0(double %a) { +; GCN-LABEL: v_add_f64_200.0: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_f64_e32 v[0:1], 0x40690000, v[0:1] +; GCN-NEXT: ; return to shader part epilog + %add = fadd double %a, 200.0 + %ret = bitcast double %add to <2 x float> + ret <2 x float> %ret +} + +; No folding into VOP3 + +define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) { +; GCN-SDAG-LABEL: v_lshl_add_u64: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xf12345678) +; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, s[0:1] +; GCN-SDAG-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_lshl_add_u64: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], lit64(0xf12345678) +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3] +; GCN-GISEL-NEXT: ; return to shader part epilog + %shl = shl i64 %a, 1 + %add = add i64 %shl, 64729929336 + %ret = bitcast i64 %add to <2 x float> + ret <2 x float> %ret +} + +; No folding into VOP2 promoted to VOP3 + +define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) { +; GCN-SDAG-LABEL: v_fma_f64: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: v_fmaak_f64 v[4:5], v[0:1], v[2:3], lit64(0x4063233333333333) +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4069033333333333) +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-SDAG-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], lit64(0x4069033333333333) +; GCN-SDAG-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GCN-SDAG-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_fma_f64: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0x4063233333333333) +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-GISEL-NEXT: v_fmac_f64_e32 v[4:5], v[0:1], v[2:3] +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4069033333333333) +; GCN-GISEL-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], lit64(0x4069033333333333) +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-GISEL-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] +; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GCN-GISEL-NEXT: ; return to shader part epilog + %r1 = call double @llvm.fma.f64(double %a, double %b, double 153.1) nounwind readnone + %r2 = call 
double @llvm.fma.f64(double %a, double %r1, double 200.1) nounwind readnone + %r3 = call double @llvm.fma.f64(double %r2, double %r1, double 200.1) nounwind readnone + %ret = bitcast double %r3 to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) { +; GCN-SDAG-LABEL: v_add_neg_f64: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0x4069033333333333) +; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-SDAG-NEXT: v_add_f64_e64 v[0:1], -v[0:1], s[0:1] +; GCN-SDAG-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_add_neg_f64: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4069033333333333) +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-GISEL-NEXT: v_add_f64_e64 v[0:1], -v[0:1], v[2:3] +; GCN-GISEL-NEXT: ; return to shader part epilog + %fneg = fsub double -0.0, %a + %add = fadd double %fneg, 200.1 + %ret = bitcast double %add to <2 x float> + ret <2 x float> %ret +} + +define amdgpu_ps <2 x float> @v_cndmask(double %a) { +; GCN-SDAG-LABEL: v_cndmask: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] +; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x40632000 +; GCN-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x40690333, v1, vcc_lo +; GCN-SDAG-NEXT: ; return to shader part epilog +; +; GCN-GISEL-LABEL: v_cndmask: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] +; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x40690333 +; GCN-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0x40632000, vcc_lo +; GCN-GISEL-NEXT: ; return to shader part epilog + %cmp = fcmp oeq double %a, 0.0 + %sel = select i1 %cmp, double 153.0, double 200.1 + %ret = bitcast double %sel to <2 x float> + ret <2 x float> %ret +} + +declare i1 @llvm.amdgcn.class.f64(double, i32) nounwind readnone +declare double @llvm.amdgcn.rsq.f64(double) nounwind readnone +declare double @llvm.fma.f64(double, double, double) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 2913fe001621d..4f81d3599d1de 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -8,11 +8,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,ee-instrument,scalarize-masked-mem-intrin,ExpandReductionsPass,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,UnreachableBlockElimPass,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,consthoist,ReplaceWithVeclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,ExpandReductionsPass,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,UnreachableBlockElimPass,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions,detect-dead-lanes,dead-mi-elimination,init-undef,ProcessImplicitDefsPass,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,UnreachableBlockElimPass,consthoist,ReplaceWithVeclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,ExpandReductionsPass,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,UnreachableBlockElimPass,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions,detect-dead-lanes,dead-mi-elimination,init-undef,ProcessImplicitDefsPass,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index af3241e95e91d..2a5c65278f7dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -329,6 +329,7 @@ ; GCN-O1-NEXT: Remove dead machine instructions ; GCN-O1-NEXT: SI Shrink Instructions ; GCN-O1-NEXT: Register Usage Information Propagation +; GCN-O1-NEXT: AMDGPU 
Prepare AGPR Alloc ; GCN-O1-NEXT: Detect Dead Lanes ; GCN-O1-NEXT: Remove dead machine instructions ; GCN-O1-NEXT: Init Undef Pass @@ -640,6 +641,7 @@ ; GCN-O1-OPTS-NEXT: Remove dead machine instructions ; GCN-O1-OPTS-NEXT: SI Shrink Instructions ; GCN-O1-OPTS-NEXT: Register Usage Information Propagation +; GCN-O1-OPTS-NEXT: AMDGPU Prepare AGPR Alloc ; GCN-O1-OPTS-NEXT: Detect Dead Lanes ; GCN-O1-OPTS-NEXT: Remove dead machine instructions ; GCN-O1-OPTS-NEXT: Init Undef Pass @@ -956,6 +958,7 @@ ; GCN-O2-NEXT: Remove dead machine instructions ; GCN-O2-NEXT: SI Shrink Instructions ; GCN-O2-NEXT: Register Usage Information Propagation +; GCN-O2-NEXT: AMDGPU Prepare AGPR Alloc ; GCN-O2-NEXT: Detect Dead Lanes ; GCN-O2-NEXT: Remove dead machine instructions ; GCN-O2-NEXT: Init Undef Pass @@ -1286,6 +1289,7 @@ ; GCN-O3-NEXT: Remove dead machine instructions ; GCN-O3-NEXT: SI Shrink Instructions ; GCN-O3-NEXT: Register Usage Information Propagation +; GCN-O3-NEXT: AMDGPU Prepare AGPR Alloc ; GCN-O3-NEXT: Detect Dead Lanes ; GCN-O3-NEXT: Remove dead machine instructions ; GCN-O3-NEXT: Init Undef Pass diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll new file mode 100644 index 0000000000000..091859f3c9bf3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.cos.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}cos_bf16: +; GCN: v_cos_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat %src) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}cos_bf16_constant_4 +; GCN: v_cos_bf16_e32 v0, 4.0 +define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { + %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat 4.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}cos_bf16_constant_100 +; GCN: v_cos_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { + %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat 100.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll new file mode 100644 index 0000000000000..6304923790ad5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.exp2.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}exp_bf16: +; GCN: v_exp_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @exp_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat %src) #0 + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}exp_bf16_constant_4 +; GCN: v_exp_bf16_e32 v0, 4.0 +define amdgpu_kernel void @exp_bf16_constant_4(ptr addrspace(1) %out) #1 { + %exp 
= call bfloat @llvm.amdgcn.exp2.bf16(bfloat 4.0) #0 + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}exp_bf16_constant_100 +; GCN: v_exp_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @exp_bf16_constant_100(ptr addrspace(1) %out) #1 { + %exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat 100.0) #0 + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll new file mode 100644 index 0000000000000..a8b2077f5a35b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.log.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}log_bf16: +; GCN: v_log_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @log_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %log = call bfloat @llvm.amdgcn.log.bf16(bfloat %src) #0 + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}log_bf16_constant_4 +; GCN: v_log_bf16_e32 v0, 4.0 +define amdgpu_kernel void @log_bf16_constant_4(ptr addrspace(1) %out) #1 { + %log = call bfloat @llvm.amdgcn.log.bf16(bfloat 4.0) #0 + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}log_bf16_constant_100 +; GCN: v_log_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @log_bf16_constant_100(ptr addrspace(1) %out) #1 { + %log = call bfloat @llvm.amdgcn.log.bf16(bfloat 100.0) #0 + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 67ae05eb6f0b8..561eaca3b77df 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -4365,8 +4365,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 @@ -4465,8 +4465,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll new file mode 100644 index 0000000000000..3c49d0b9c01b1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; xUN: llc -global-isel=0 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefix=SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefix=SDAG-FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefix=GI-TRUE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefix=GI-FAKE16 %s + +; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.rcp.bf16(bfloat) #0 + +define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 { +; SDAG-TRUE16-LABEL: rcp_bf16: +; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, s2 +; SDAG-TRUE16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-TRUE16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rcp_bf16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_rcp_bf16_e32 v0, s2 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat %src) #0 + store bfloat %rcp, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll new file mode 100644 index 0000000000000..0a8a90422d1f2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; xUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s + +; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. 
+; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.rsq.bf16(bfloat) #0 + +define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 { +; SDAG-REAL16-LABEL: rsq_bf16: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, s2 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rsq_bf16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, s2 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat %src) #0 + store bfloat %rsq, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: rsq_bf16_constant_4: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, 4.0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rsq_bf16_constant_4: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, 4.0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat 4.0) #0 + store bfloat %rsq, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: rsq_bf16_constant_100: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, 0x42c8 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rsq_bf16_constant_100: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, 0x42c8 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat 100.0) #0 + store bfloat %rsq, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @rsq_undef_bf16(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: rsq_undef_bf16: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rsq_undef_bf16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_endpgm + %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat undef) + store bfloat %rsq, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll new file mode 100644 index 0000000000000..9c35a7eae0b8e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck 
-check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.sin.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}sin_bf16: +; GCN: v_sin_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat %src) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}sin_bf16_constant_4 +; GCN: v_sin_bf16_e32 v0, 4.0 +define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { + %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat 4.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}sin_bf16_constant_100 +; GCN: v_sin_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { + %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat 100.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll new file mode 100644 index 0000000000000..5287b5dba848f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.sqrt.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}sqrt_bf16: +; GCN: v_sqrt_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat %src) #0 + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}sqrt_bf16_constant_4 +; GCN: v_sqrt_bf16_e32 v0, 4.0 +define amdgpu_kernel void @sqrt_bf16_constant_4(ptr addrspace(1) %out) #1 { + %sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat 4.0) #0 + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}sqrt_bf16_constant_100 +; GCN: v_sqrt_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @sqrt_bf16_constant_100(ptr addrspace(1) %out) #1 { + %sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat 100.0) #0 + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll index 344c0112e4a54..dd89f80a54949 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll @@ -1,14 +1,180 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; xUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s ; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s ; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck 
-check-prefix=GISEL-FAKE16 %s -; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. ; FIXME: GlobalISel does not work with bf16 +declare float @llvm.amdgcn.tanh.f32(float) #0 +declare half @llvm.amdgcn.tanh.f16(half) #0 declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0 +define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 { +; SDAG-REAL16-LABEL: tanh_f32: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, s2 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, s2 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float %src) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f32_constant_4.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 4.0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32_constant_4.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 4.0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float 4.0) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f32_constant_100.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 0x42c80000 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32_constant_100.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 0x42c80000 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float 100.0) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_undef_f32: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_undef_f32: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float undef) + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half %src) #1 { +; SDAG-REAL16-LABEL: tanh_f16: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; 
SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, s2 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, s2 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call half @llvm.amdgcn.tanh.f16(half %src) #0 + store half %tanh, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f16_constant_4.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, 4.0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f16_constant_4.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, 4.0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call half @llvm.amdgcn.tanh.f16(half 4.0) #0 + store half %tanh, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f16_constant_100.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, 0x5640 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f16_constant_100.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, 0x5640 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call half @llvm.amdgcn.tanh.f16(half 100.0) #0 + store half %tanh, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @tanh_undef_f16(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_undef_f16: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_undef_f16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call half @llvm.amdgcn.tanh.f16(half undef) + store half %tanh, ptr addrspace(1) %out, align 2 + ret void +} + define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; SDAG-REAL16-LABEL: tanh_bf16: ; SDAG-REAL16: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll new file mode 100644 index 0000000000000..2f5ff90c9274f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll @@ -0,0 +1,840 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) 
{ +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off +; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 true) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> 
@llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; 
GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8: +; 
GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_i32_16x16x64_iu8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 true) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + 
ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> 
%C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: 
s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_swmmac_bf16_16x16x64_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 true) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] 
matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: v_mov_b32_e32 v29, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> 
@llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: v_mov_b32_e32 v29, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: v_mov_b32_e32 v29, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: v_mov_b32_e32 v29, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define 
amdgpu_ps void @test_swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 true) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x64_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x64_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse +; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) +declare <8 x 
bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, 
i64, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i64 %Index, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll new file mode 100644 index 0000000000000..fe8358fcc7a9a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll @@ -0,0 +1,2238 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off +; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v6, 1.0 :: v_dual_mov_b32 v8, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v6 +; GFX1250-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v6 +; GFX1250-NEXT: v_dual_mov_b32 v12, v6 :: v_dual_mov_b32 v13, v6 +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_splat: +; GISEL: ; %bb.0: ; 
%bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off +; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX1250-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX1250-NEXT: v_mov_b32_e32 v13, v6 +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off +; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off 
offset:16 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GISEL-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GISEL-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GISEL-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GISEL-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GISEL-NEXT: v_mov_b32_e32 v25, v18 +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off 
offset:16 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18 +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 0.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3fc03fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b32_e32 v18, 0x3fc03fc0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: 
s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5>, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: global_store_b128 v[16:17], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GISEL-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GISEL-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: global_store_b128 v[16:17], v[26:29], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_inlinable(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: global_store_b128 v[16:17], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable: +; 
GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GISEL-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GISEL-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GISEL-NEXT: v_mov_b32_e32 v25, v18 +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: global_store_b128 v[16:17], v[26:29], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 
0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; 
GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: 
v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_wmma_f32_16x16x64_bf8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 
v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: 
v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat: +; GFX1250: ; 
%bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: 
test_wmma_f16_16x16x64_bf8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_i32_16x16x64_iu8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> , 
i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1 :: v_dual_mov_b32 v20, 2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1 +; GISEL-NEXT: s_mov_b32 s2, 2 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> , i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x80 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_movk_i32 s0, 0x80 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off 
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> , i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: 
v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call 
<8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: 
global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; 
GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; 
GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 
v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: 
test_wmma_f32_16x16x128_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; 
GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 
v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void
@test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT:
global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v28, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v29, v26 +; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26 +; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26 +; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26 +; GFX1250-NEXT: v_dual_mov_b32 v36, v28 :: v_dual_mov_b32 v37, v26 +; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26 +; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s2 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35],
s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26 +; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26 +; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26 +; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26 +; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26 +; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26 +; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26 +; GFX1250-NEXT: v_mov_b32_e32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +;
GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll new file mode 100644 index 0000000000000..9802144a29577 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll @@ -0,0 +1,1993 @@ +; NOTE: Assertions have been autogenerated by
utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negA(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off +; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 1, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negB(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off +; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 1, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off +; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32_neg_absC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_neg_absC: 
+; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off +; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x4_f32_ignoreC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x4_f32_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off +; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_wmma_f32_16x16x32_bf16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 1, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; 
GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 1, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 1, <8 x bfloat> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 
v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 3, <8 x bfloat> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 4, <8 x bfloat> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 1, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_wmma_bf16f32_16x16x32_bf16_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) 
%out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], 
v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 
v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x 
i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void 
+} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 x 
i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { 
+; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedA(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_signedA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_signedA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 1, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedB(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_signedB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_signedB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 1, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], 
off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 1, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 1, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_neg_absC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret 
void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_ignoreC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 1, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 1, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 
0, <16 x half> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_neg_absC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_ignoreC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 
3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x 
half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) 
%out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], 
v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> 
%A, <16 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: 
test_wmma_f32_16x16x128_bf8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off 
+; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] +; GFX1250-NEXT: 
s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 4, <16 x float> %C) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 1, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 1, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_swmmac_bf16_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 1, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 1, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 1, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 
%Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 1, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedA(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_signedA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_signedA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 1, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedB(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_signedB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_signedB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 0, <8 x i32> %A, i1 1, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negA(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: 
test_swmmac_f32_16x16x64_f16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x64_f16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 1, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negB(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x64_f16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 1, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negA(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x64_f16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 1, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negB(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX1250-LABEL: 
test_swmmac_f16_16x16x64_f16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x64_f16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 1, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) +declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i64, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll new file mode 100644 index 0000000000000..b8745e0ebf480 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll @@ -0,0 +1,690 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL + +define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b32 v32, v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_swmmac_f32_16x16x64_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b32 v32, v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4 + %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16> + %Index = extractelement <2 x i16> %IndexVec, i32 1 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b32 v28, v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b32 v28, v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4 + %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16> + %Index = extractelement <2 x i16> %IndexVec, i32 1 + %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false) + store <8 x bfloat> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b32 v32, v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b32 v32, v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4 + %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16> + %Index = extractelement <2 x i16> %IndexVec, i32 1 + %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) 
%out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32> + %Index = extractelement <2 x i32> %IndexVec, i32 1 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr 
addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32> + %Index = extractelement <2 x i32> %IndexVec, i32 1 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32> + %Index = extractelement <2 x i32> %IndexVec, i32 1 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: 
global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32> + %Index = extractelement <2 x i32> %IndexVec, i32 1 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 
+; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32> + %Index = extractelement <2 x i32> %IndexVec, i32 1 + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32> + %Index = extractelement <2 x i32> %IndexVec, i32 1 + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C,
i32 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GFX1250-NEXT: global_store_b64
v[30:31], v[28:29], off +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32> + %Index = extractelement <2 x i32> %IndexVec, i32 1 + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 
%IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32> + %Index = extractelement <2 x i32> %IndexVec, i32 1 + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i32 %Index, i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8 + store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr + %Index = lshr i64 %IndexVecPacked, 32 + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 
false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b32 v32, v[32:33], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f32_16x16x64_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b32 v32, v[32:33], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4 + %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16> + %Index = extractelement <2 x i16> %IndexVec, i32 1 + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: global_load_b32 v28, v[28:29], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_swmmac_f16_16x16x64_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: global_load_b32 v28, v[28:29], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off +; GISEL-NEXT: s_endpgm +bb: + %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4 + %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16> + %Index = extractelement <2 x i16> %IndexVec, i32 1 + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) +declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x 
i32>, <16 x i32>, <8 x float>, i32, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i32, i1, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll new file mode 100644 index 0000000000000..0f37639059169 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll @@ -0,0 +1,1013 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1200-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1200-SDAG-FAKE16 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1200-GI-TRUE16 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1200-GI-FAKE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1250-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1250-SDAG-FAKE16 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1250-GI-TRUE16 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1250-GI-FAKE16 + +define bfloat @v_exp2_bf16(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call bfloat @llvm.exp2.bf16(bfloat %in) + ret bfloat %result +} + +define bfloat @v_exp2_fabs_bf16(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fabs_bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX1200-SDAG-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fabs_bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fabs_bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, |v0.l| +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fabs_bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, |v0| +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %in) + %result = call bfloat @llvm.exp2.bf16(bfloat %fabs) + ret bfloat %result +} + +define bfloat @v_exp2_fneg_fabs_bf16(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -|v0.l| +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -|v0| +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %in) + %fneg.fabs = fneg bfloat %fabs + %result = call bfloat @llvm.exp2.bf16(bfloat %fneg.fabs) + ret bfloat %result +} + +define bfloat @v_exp2_fneg_bf16(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fneg_bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 
vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -v0 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fneg = fneg bfloat %in + %result = call bfloat @llvm.exp2.bf16(bfloat %fneg) + ret bfloat %result +} + +define bfloat @v_exp2_bf16_fast(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_bf16_fast: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; 
GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_bf16_fast: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_bf16_fast: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_bf16_fast: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call fast bfloat @llvm.exp2.bf16(bfloat %in) + ret bfloat %result +} + +define <2 x bfloat> @v_exp2_v2bf16(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_v2bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; 
GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_v2bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_v2bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.h, v0.h +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_v2bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v1, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_exp2_fabs_v2bf16(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fabs_v2bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fabs_v2bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 
v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fabs_v2bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1250-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v1.l +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.h, v2.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fabs_v2bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v1, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) + %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %fabs) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_v2bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 15 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v2.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v1, v1, v2 :: 
v_dual_lshlrev_b32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-FAKE16-NEXT: 
v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_v2bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1250-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -v1.l +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, -v2.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, -v1 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -v0 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) + %fneg.fabs = fneg <2 x bfloat> %fabs + %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %fneg.fabs) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_exp2_fneg_v2bf16(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fneg_v2bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; 
GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_v2bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_v2bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, -v0.h +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_v2bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, -v1 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fneg = fneg <2 x bfloat> %in + %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %fneg) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_exp2_v2bf16_fast(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_v2bf16_fast: +; GFX1200-SDAG-TRUE16: ; %bb.0: 
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_v2bf16_fast: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; 
GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_v2bf16_fast: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.h, v0.h +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_v2bf16_fast: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v1, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call fast <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %result +} + +declare bfloat @llvm.exp2.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat>) #0 +declare bfloat @llvm.fabs.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll new file mode 100644 index 0000000000000..5bd9fa6f23aa0 
--- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GFX-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GFX-SDAG-FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GFX-GISEL-TRUE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GFX-GISEL-FAKE16 %s + +define bfloat @v_log2_bf16(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call bfloat @llvm.log2.bf16(bfloat %in) + ret bfloat %result +} + +define bfloat @v_log2_fabs_bf16(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fabs_bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, |v0.l| +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, |v0| +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %in) + %result = call bfloat @llvm.log2.bf16(bfloat %fabs) + ret bfloat %result +} + +define bfloat @v_log2_fneg_fabs_bf16(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -|v0.l| +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -|v0| +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %in) + %fneg.fabs = fneg bfloat %fabs + %result = call bfloat @llvm.log2.bf16(bfloat %fneg.fabs) + ret bfloat %result +} + +define bfloat @v_log2_fneg_bf16(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fneg = fneg bfloat %in + %result = call bfloat @llvm.log2.bf16(bfloat %fneg) + ret bfloat %result +} + +define bfloat 
@v_log2_bf16_fast(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_bf16_fast: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_bf16_fast: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call fast bfloat @llvm.log2.bf16(bfloat %in) + ret bfloat %result +} + +define <2 x bfloat> @v_log2_v2bf16(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_v2bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v0.h +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_v2bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_log2_fabs_v2bf16(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fabs_v2bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 +; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v1.l +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v2.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_v2bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) + %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fabs) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_log2_fneg_fabs_v2bf16(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 +; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, 
-v1.l +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v2.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_v2bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) + %fneg.fabs = fneg <2 x bfloat> %fabs + %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fneg.fabs) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_log2_fneg_v2bf16(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_v2bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v0.h +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_v2bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fneg = fneg <2 x bfloat> %in + %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fneg) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_log2_v2bf16_fast(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_v2bf16_fast: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v0.h +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_v2bf16_fast: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call fast <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %result +} + +declare bfloat @llvm.log2.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat>) #0 +declare bfloat @llvm.fabs.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll index 28781ae9f13c7..c6cf6f64db1eb 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index af914bd4043cf..355f77acfd302 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -76,12 +76,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_movk_i32 s4, 0xfc01 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v5, v5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll new file mode 100644 index 0000000000000..47b2b68f05abc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s + +; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. 
+ +declare bfloat @llvm.sqrt.bf16(bfloat %a) +declare <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + +define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) { +; GFX12-TRUE16-LABEL: sqrt_bf16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sqrt_bf16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm +entry: + %a.val = load bfloat, ptr addrspace(1) %a + %r.val = call bfloat @llvm.sqrt.bf16(bfloat %a.val) + store bfloat %r.val, ptr addrspace(1) %r + ret void +} + +define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) { +; GFX12-TRUE16-LABEL: sqrt_v2bf16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: v_nop +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sqrt_v2bf16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: 
s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0 +; GFX12-FAKE16-NEXT: v_nop +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm +entry: + %a.val = load <2 x bfloat>, ptr addrspace(1) %a + %r.val = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a.val) + store <2 x bfloat> %r.val, ptr addrspace(1) %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx10.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx10.mir new file mode 100644 index 0000000000000..402c00298c8da --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx10.mir @@ -0,0 +1,1526 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX10 %s + +--- +name: gfx10_tbuffer_load_x_xyz +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_load_xyz_x +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_xyz_x + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_load_xy_xy +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +...
+--- + +name: gfx10_tbuffer_load_x_xy +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_load_xy_x +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_x + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_load_x_x +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_load_x_x_format_32_32_32_32 +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32 + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 77, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_load_float_32 +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_float_32 + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 
0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_load_sint_32 +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_sint_32 + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 76, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, 
addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_load_uint_32 +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_uint_32 + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 62, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 75, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET 
%5:sgpr_128, 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_load_not_merged_data_format_mismatch +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + 
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_load_not_merged_num_format_mismatch +body: | + bb.0.entry: + ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_store_x_xyz +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-LABEL: name: gfx10_tbuffer_store_x_xyz + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_store_xyz_x +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-LABEL: name: gfx10_tbuffer_store_xyz_x + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_store_xy_xy +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-LABEL: name: gfx10_tbuffer_store_xy_xy + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1 + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_store_x_xy +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-LABEL: name: gfx10_tbuffer_store_x_xy + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_store_xy_x +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-LABEL: name: gfx10_tbuffer_store_xy_x + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +...
+--- + +name: gfx10_tbuffer_store_x_x +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-LABEL: name: gfx10_tbuffer_store_x_x + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_store_x_x_format_32_32_32_32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32 + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 77, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_store_float32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-LABEL: name: gfx10_tbuffer_store_float32 + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact 
%7:vgpr_32, %13:sgpr_128, 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_store_sint32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-LABEL: name: gfx10_tbuffer_store_sint32 + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 76, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY 
$vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: gfx10_tbuffer_store_uint32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-LABEL: name: gfx10_tbuffer_store_uint32 + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 62, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], 
%subreg.sub2 + ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 75, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... 
+--- + +name: gfx10_tbuffer_store_not_merged_data_format_mismatch +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 
implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_store_not_merged_num_format_mismatch
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch
+    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX10-NEXT: {{  $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    %12:vgpr_32 = COPY $vgpr8
+    %11:vgpr_32 = COPY $vgpr7
+    %10:vgpr_32 = COPY $vgpr6
+    %9:vgpr_32 = COPY $vgpr5
+    %8:vgpr_32 = COPY $vgpr4
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_not_merged_swizzled_0
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_not_merged_swizzled_1
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_merge_across_swizzle
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %5:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %6:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_x_idxen
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_idxen
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_xy_idxen
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_idxen
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_xy_xy_idxen
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_xyz_idxen
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN %4, %5:sgpr_128, 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_x_x_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0_sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2
+    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_x_bothen
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_bothen
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_xy_bothen
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_bothen
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_xy_xy_bothen
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_xyz_bothen
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN %4, %5:sgpr_128, 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_x_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_idxen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_xy_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_idxen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_xy_xy_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_xyz_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact %4, %5:sgpr_128, 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_x_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_bothen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_xy_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_bothen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_xy_xy_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_xyz_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_x_x_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0_sub1
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
+    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
+    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:vreg_64 = COPY $vgpr1
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_32 = COPY $sgpr4
+    %5:vreg_64 = COPY $vgpr0
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:vgpr_32 = COPY $vgpr1
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %6:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
+body: |
+  bb.0.entry:
+    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
+    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_32 = COPY $sgpr4
+    %5:vgpr_32 = COPY $vgpr0
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %7:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir
new file mode 100644
index 0000000000000..62cc5659fcc6b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir
@@ -0,0 +1,1527 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX11 %s
+
+---
+name: gfx11_tbuffer_load_x_xyz
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_xyz_x
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_xyz_x
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_xy_xy
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_x_xy
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_xy_x
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_x
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_x_x
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_x_x_format_32_32_32_32
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_format_32_32_32_32
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_float_32
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_float_32
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
+    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
+    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
+    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
+    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
+    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
+    ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
+    ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
+    ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_sint_32 +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_sint_32 + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 49, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 62, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 59, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 + ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed 
[[COPY12]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_uint_32 +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_uint_32 + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 48, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 61, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 58, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 + ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GFX11-NEXT: 
[[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_not_merged_data_format_mismatch +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_data_format_mismatch + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: 
[[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +... 
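+# Check that loads whose num format differs are not merged.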
+--- + +name: gfx11_tbuffer_load_not_merged_num_format_mismatch +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_num_format_mismatch + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, 
addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_store_x_xyz +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-LABEL: name: gfx11_tbuffer_store_x_xyz + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +... 
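+# Check that an XYZ store followed by a contiguous X store is merged into a single XYZW store.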
+--- + +name: gfx11_tbuffer_store_xyz_x +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-LABEL: name: gfx11_tbuffer_store_xyz_x + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +... 
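+# Check that two contiguous XY stores are merged into a single XYZW store.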
+--- + +name: gfx11_tbuffer_store_xy_xy +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_xy + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1 + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +... 
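+# Check that an X store followed by a contiguous XY store is merged into an XYZ store.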
+--- + +name: gfx11_tbuffer_store_x_xy +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-LABEL: name: gfx11_tbuffer_store_x_xy + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +... 
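+# Check that an XY store followed by a contiguous X store is merged into an XYZ store.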
+--- + +name: gfx11_tbuffer_store_xy_x +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_x + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +...
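+# Check that two contiguous X stores are merged into a single XY store.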
+--- + +name: gfx11_tbuffer_store_x_x +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-LABEL: name: gfx11_tbuffer_store_x_x + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +... 
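+# Check that two X stores using the 32_32_32_32 data format are still merged into an XY store.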
+--- + +name: gfx11_tbuffer_store_x_x_format_32_32_32_32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-LABEL: name: gfx11_tbuffer_store_x_x_format_32_32_32_32 + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +... 
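+# Check that a sequence of float32 scalar stores collapses into XY, XYZW, and XYZ groups.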
+--- + +name: gfx11_tbuffer_store_float32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-LABEL: name: gfx11_tbuffer_store_float32 + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, 
%13:sgpr_128, 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_store_sint32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-LABEL: name: gfx11_tbuffer_store_sint32 + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 49, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 62, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 59, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + 
%7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_store_uint32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-LABEL: name: gfx11_tbuffer_store_uint32 + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 48, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = 
REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 61, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 58, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +... 
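+# Check that stores whose data format differs are not merged.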
+--- + +name: gfx11_tbuffer_store_not_merged_data_format_mismatch +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-LABEL: name: gfx11_tbuffer_store_not_merged_data_format_mismatch + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, implicit 
$exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_store_not_merged_num_format_mismatch +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-LABEL: name: gfx11_tbuffer_store_not_merged_num_format_mismatch + ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable 
store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +... 
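+# Check that a set swizzle (SWZ) bit on either access prevents merging.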
+--- + +name: gfx11_tbuffer_load_not_merged_swizzled_0 +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_0 + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_not_merged_swizzled_1 +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_1 + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +... 
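+# Check that mergeable loads are still combined across an intervening swizzled load.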
+--- + +name: gfx11_tbuffer_load_merge_across_swizzle +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_merge_across_swizzle + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_x_x_idxen +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_idxen + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
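+# Check that the X + XY merge also applies to the IDXEN addressing variants.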
+--- + +name: gfx11_tbuffer_load_x_xy_idxen +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_idxen + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 60, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_xy_xy_idxen +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_x_xyz_idxen +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN %4, %5:sgpr_128, 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_x_x_bothen +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_bothen + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_x_xy_bothen +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_bothen + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 60, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_xy_xy_bothen +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_x_xyz_bothen +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN %4, %5:sgpr_128, 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_x_x_idxen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_idxen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_x_xy_idxen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_idxen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 60, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_xy_xy_idxen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_x_xyz_idxen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact %4, %5:sgpr_128, 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_x_x_x_idxen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0_sub1 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2 + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0 +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0 + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_x_x_bothen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_bothen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_x_xy_bothen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_bothen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 60, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_xy_xy_bothen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_x_xyz_bothen_exact +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen_exact + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:vreg_64 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vreg_64 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+---
+
+name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:vgpr_32 = COPY $vgpr1
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %6:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_32 = COPY $sgpr4
+    %5:vgpr_32 = COPY $vgpr0
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %7:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_x_x_x_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact
+    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0_sub1
+    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
+    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+--- + +name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0 +body: | + bb.0.entry: + ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0 + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx9.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx9.mir new file mode 100644 index 0000000000000..3a43e743de493 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx9.mir @@ -0,0 +1,1553 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s + +--- +name: gfx9_tbuffer_load_x_xyz +body: | + bb.0.entry: + ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz + ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) +... +--- + +name: gfx9_tbuffer_load_xyz_x +body: | + bb.0.entry: + ; GFX9-LABEL: name: gfx9_tbuffer_load_xyz_x + ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+--- + +name: gfx9_tbuffer_load_xy_xy +body: | + bb.0.entry: + ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy + ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... +--- + +name: gfx9_tbuffer_load_x_xy +body: | + bb.0.entry: + ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy + ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... 
+--- + +name: gfx9_tbuffer_load_xy_x +body: | + bb.0.entry: + ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_x + ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + + +name: gfx9_tbuffer_load_x_x +body: | + bb.0.entry: + ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x + ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... 
+---
+
+name: gfx9_tbuffer_load_x_x_format_32_32_32_32
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_float_32
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_float_32
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
+    ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
+    ; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
+    ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_sint_32
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_sint_32
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 91, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 94, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 93, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
+    ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
+    ; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
+    ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_uint_32
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_uint_32
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 78, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 77, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
+    ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
+    ; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
+    ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_not_merged_data_format_mismatch
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_not_merged_num_format_mismatch
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_x_xyz
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_x_xyz
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
+    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_xyz_x
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_xyz_x
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
+    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
+    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_xy_xy
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_xy_xy
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+    %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_x_xy
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_x_xy
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_xy_x
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_xy_x
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_x_x
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_x_x
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_x_x_format_32_32_32_32
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_float32
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_float32
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
+    ; GFX9-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+    ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+    %12:vgpr_32 = COPY $vgpr8
+    %11:vgpr_32 = COPY $vgpr7
+    %10:vgpr_32 = COPY $vgpr6
+    %9:vgpr_32 = COPY $vgpr5
+    %8:vgpr_32 = COPY $vgpr4
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_sint32
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_sint32
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 91, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
+    ; GFX9-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 94, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+    ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 93, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+    %12:vgpr_32 = COPY $vgpr8
+    %11:vgpr_32 = COPY $vgpr7
+    %10:vgpr_32 = COPY $vgpr6
+    %9:vgpr_32 = COPY $vgpr5
+    %8:vgpr_32 = COPY $vgpr4
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_uint32
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_uint32
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
+    ; GFX9-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 78, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+    ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+    ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 77, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+    %12:vgpr_32 = COPY $vgpr8
+    %11:vgpr_32 = COPY $vgpr7
+    %10:vgpr_32 = COPY $vgpr6
+    %9:vgpr_32 = COPY $vgpr5
+    %8:vgpr_32 = COPY $vgpr4
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_not_merged_data_format_mismatch
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    %12:vgpr_32 = COPY $vgpr8
+    %11:vgpr_32 = COPY $vgpr7
+    %10:vgpr_32 = COPY $vgpr6
+    %9:vgpr_32 = COPY $vgpr5
+    %8:vgpr_32 = COPY $vgpr4
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_store_not_merged_num_format_mismatch
+body: |
+  bb.0.entry:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    %12:vgpr_32 = COPY $vgpr8
+    %11:vgpr_32 = COPY $vgpr7
+    %10:vgpr_32 = COPY $vgpr6
+    %9:vgpr_32 = COPY $vgpr5
+    %8:vgpr_32 = COPY $vgpr4
+    %7:vgpr_32 = COPY $vgpr3
+    %6:vgpr_32 = COPY $vgpr2
+    %5:vgpr_32 = COPY $vgpr1
+    %4:vgpr_32 = COPY $vgpr0
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_not_merged_swizzled_0
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_not_merged_swizzled_1
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_merge_across_swizzle
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %5:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %6:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_merge_across_swizzled_store
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_merge_across_swizzled_store
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %6:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %5:sgpr_128, 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_x_idxen
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_idxen
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_xy_idxen
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy_idxen
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_xy_xy_idxen
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_xyz_idxen
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN %4, %5:sgpr_128, 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_x_bothen
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_bothen
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_xy_bothen
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy_bothen
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_xy_xy_bothen
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_xyz_bothen
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN %4, %5:sgpr_128, 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_x_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_idxen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_xy_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy_idxen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_xy_xy_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_xyz_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact %4, %5:sgpr_128, 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_x_x_idxen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0_sub1
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_x_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_bothen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_xy_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy_bothen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_xy_xy_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_xyz_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_x_x_bothen_exact
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0_sub1
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
+    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
+    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vreg_64 = COPY $vgpr0
+    %5:vreg_64 = COPY $vgpr1
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_32 = COPY $sgpr4
+    %5:vreg_64 = COPY $vgpr0
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:vgpr_32 = COPY $vgpr0
+    %5:vgpr_32 = COPY $vgpr1
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %6:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
+body: |
+  bb.0.entry:
+    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %0:sgpr_32 = COPY $sgpr0
+    %1:sgpr_32 = COPY $sgpr1
+    %2:sgpr_32 = COPY $sgpr2
+    %3:sgpr_32 = COPY $sgpr3
+    %4:sgpr_32 = COPY $sgpr4
+    %5:vgpr_32 = COPY $vgpr0
+    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %7:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
deleted file mode 100644
index 9766b427b4325..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
+++ /dev/null
@@ -1,8708 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX11 %s
-
-#
-# GFX9 tests
-#
-
----
-name: gfx9_tbuffer_load_x_xyz
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xyz_x
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_xyz_x
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_xyz_x
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_xyz_x
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xy
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_x
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_x
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_x
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_x
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-
-name: gfx9_tbuffer_load_x_x
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_format_32_32_32_32
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_float_32
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_float_32
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
-    ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_float_32
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_float_32
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1,
addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... ---- - -name: gfx9_tbuffer_load_sint_32 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx9_tbuffer_load_sint_32 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 91, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 94, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 - ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 - ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 - ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 - ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 93, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX9-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 - ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 - ; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 - ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 - ; - ; GFX10-LABEL: name: gfx9_tbuffer_load_sint_32 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: 
[[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx9_tbuffer_load_sint_32 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 84, 
0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... 
---- - -name: gfx9_tbuffer_load_uint_32 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx9_tbuffer_load_uint_32 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 78, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 - ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 - ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 - ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 - ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 77, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX9-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 - ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 - ; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 - ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 - ; - ; GFX10-LABEL: name: gfx9_tbuffer_load_uint_32 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: 
[[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx9_tbuffer_load_uint_32 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - 
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... ---- - -name: gfx9_tbuffer_load_not_merged_data_format_mismatch -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: 
[[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: 
[[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... 
----
-
-name: gfx9_tbuffer_load_not_merged_num_format_mismatch
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_x_xyz
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_x_xyz
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_x_xyz
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_x_xyz
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_xyz_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_xyz_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_xyz_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_xyz_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_xy_xy
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_xy_xy
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_xy_xy
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_xy_xy
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_x_xy
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_x_xy
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_x_xy
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_x_xy
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_xy_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_xy_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_xy_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_xy_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_x_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_x_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_x_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_x_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_x_x_format_32_32_32_32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_float32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_float32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX9-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_float32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_float32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_sint32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_sint32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 91, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX9-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 94, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 93, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_sint32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_sint32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_uint32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_uint32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX9-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 78, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 77, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_uint32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_uint32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_not_merged_data_format_mismatch
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_store_not_merged_num_format_mismatch
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_not_merged_swizzled_0
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_not_merged_swizzled_1
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_merge_across_swizzle
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %5:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %6:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_merge_across_swizzled_store
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_merge_across_swizzled_store
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_merge_across_swizzled_store
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_merge_across_swizzled_store
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %6:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %5:sgpr_128, 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_idxen
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_idxen
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_idxen
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_idxen
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xy_idxen
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy_idxen
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_idxen
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_idxen
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy_idxen
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xyz_idxen
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN %4, %5:sgpr_128, 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_bothen
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_bothen
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_bothen
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_bothen
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vreg_64 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xy_bothen
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy_bothen
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_bothen
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_bothen
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vreg_64 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy_bothen
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vreg_64 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xyz_bothen
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vreg_64 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN %4, %5:sgpr_128, 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_idxen_exact
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_idxen_exact
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_idxen_exact
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_idxen_exact
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xy_idxen_exact
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy_idxen_exact
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_idxen_exact
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_idxen_exact
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy_idxen_exact
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xyz_idxen_exact
-body: |
- bb.0.entry:
- ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen_exact
- ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3
- ;
- ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen_exact
- ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ;
- ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen_exact
- ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %0:sgpr_32 = COPY $sgpr0
- %1:sgpr_32 = COPY $sgpr1
- %2:sgpr_32 = COPY $sgpr2
- %3:sgpr_32 = COPY $sgpr3
- %4:vgpr_32 = COPY $vgpr0
- %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
- %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
- %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact %4, %5:sgpr_128, 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_x_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0_sub1
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xy_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xy_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy_bothen_exact
-body: |
-  bb.0.entry:
-
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_xyz_bothen_exact
-body: |
-  bb.0.entry:
-
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_x_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0_sub1
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:vreg_64 = COPY $vgpr1
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:sgpr_32 = COPY $sgpr4
-    %5:vreg_64 = COPY $vgpr0
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:vgpr_32 = COPY $vgpr1
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %6:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:sgpr_32 = COPY $sgpr4
-    %5:vgpr_32 = COPY $vgpr0
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %7:sgpr_128, 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-#
-# GFX10 tests
-#
-
-name: gfx10_tbuffer_load_x_xyz
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xyz
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xyz_x
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xyz_x
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xyz_x
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xyz_x
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xy_xy
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_xy
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xy
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xy_x
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_x
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_x
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_x
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_x
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_x_format_32_32_32_32
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 77, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 77, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_float_32
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_float_32
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_float_32
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
-    ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_float_32
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
-    ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_sint_32
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_sint_32
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_sint_32
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 76, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
-    ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_sint_32
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 49, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 62, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 59, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
-    ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_uint_32
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_uint_32
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_uint_32
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 62, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 75, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
-    ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_uint_32
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 48, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 61, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 58, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
-    ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_not_merged_data_format_mismatch
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_not_merged_num_format_mismatch
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_x_xyz
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_x_xyz
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_x_xyz
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_x_xyz
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_xyz_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_xyz_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_xyz_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_xyz_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_xy_xy
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_xy_xy
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_xy_xy
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_xy_xy
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_x_xy
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_x_xy
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_x_xy
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_x_xy
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_xy_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_xy_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_xy_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_xy_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_x_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_x_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_x_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_x_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_x_x_format_32_32_32_32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 77, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 77, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_float32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_float32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_float32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_float32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_sint32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_sint32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_sint32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 76, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_sint32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 49, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 62, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 59, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_uint32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_uint32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_uint32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 62, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 75, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_uint32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 48, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 61, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 58, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_store_not_merged_data_format_mismatch
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{ $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-...
---- - -name: gfx10_tbuffer_store_not_merged_num_format_mismatch -body: | - bb.0.entry: - liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch - ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch - ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: 
[[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch - ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 - ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX11-NEXT: 
TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - %12:vgpr_32 = COPY $vgpr8 - %11:vgpr_32 = COPY $vgpr7 - %10:vgpr_32 = COPY $vgpr6 - %9:vgpr_32 = COPY $vgpr5 - %8:vgpr_32 = COPY $vgpr4 - %7:vgpr_32 = COPY $vgpr3 - %6:vgpr_32 = COPY $vgpr2 - %5:vgpr_32 = COPY $vgpr1 - %4:vgpr_32 = COPY $vgpr0 - %3:sgpr_32 = COPY $sgpr3 - %2:sgpr_32 = COPY $sgpr2 - %1:sgpr_32 = COPY $sgpr1 - %0:sgpr_32 = COPY $sgpr0 - %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) -... 
---- - -name: gfx10_tbuffer_load_not_merged_swizzled_0 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... 
---- - -name: gfx10_tbuffer_load_not_merged_swizzled_1 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... 
---- - -name: gfx10_tbuffer_load_merge_across_swizzle -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %5:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 
4) - %6:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... ---- - -name: gfx10_tbuffer_load_x_x_idxen -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_idxen - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_idxen - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_idxen - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0 - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:vgpr_32 = COPY $vgpr0 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 
(s32), align 1, addrspace 4) -... ---- - -name: gfx10_tbuffer_load_x_xy_idxen -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xy_idxen - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_idxen - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_idxen - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:vgpr_32 = COPY $vgpr0 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... 
---- - -name: gfx10_tbuffer_load_xy_xy_idxen -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:vgpr_32 = COPY $vgpr0 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... 
---- - -name: gfx10_tbuffer_load_x_xyz_idxen -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:vgpr_32 = COPY $vgpr0 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN %4, %5:sgpr_128, 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... 
---- - -name: gfx10_tbuffer_load_x_x_x_idxen_exact -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0_sub1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2 - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0_sub1 - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2 - ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 - ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:vgpr_32 = COPY $vgpr0 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, 
%subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... ---- - -name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], 
[[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:vgpr_32 = COPY $vgpr0 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... ---- - -name: gfx10_tbuffer_load_x_x_bothen -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_bothen - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_bothen - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_bothen - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, 
[[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:vreg_64 = COPY $vgpr0 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... ---- - -name: gfx10_tbuffer_load_x_xy_bothen -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xy_bothen - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_bothen - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_bothen - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load 
(s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %4:vreg_64 = COPY $vgpr0 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -... ---- - -name: gfx10_tbuffer_load_xy_xy_bothen -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3 - ; - ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit 
$exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_xyz_bothen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN %4, %5:sgpr_128, 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_x_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_xy_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xy_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xy_xy_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_xyz_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact %4, %5:sgpr_128, 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_x_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_xy_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xy_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xy_xy_bothen_exact
-body: |
-  bb.0.entry:
-
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_xyz_bothen_exact
-body: |
-  bb.0.entry:
-
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_x_x_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0_sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0_sub1
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:vreg_64 = COPY $vgpr1
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:sgpr_32 = COPY $sgpr4
-    %5:vreg_64 = COPY $vgpr0
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:vgpr_32 = COPY $vgpr1
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %6:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:sgpr_32 = COPY $sgpr4
-    %5:vgpr_32 = COPY $vgpr0
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %7:sgpr_128, 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-
-#
-# GFX11 tests
-#
-
-name: gfx11_tbuffer_load_x_xyz
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xyz
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xyz_x
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xyz_x
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xyz_x
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xyz_x
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xy
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xy
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_x
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_x
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_x
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_x
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-...
---- - -name: gfx11_tbuffer_load_x_x -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; - ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -... 
---- - -name: gfx11_tbuffer_load_x_x_format_32_32_32_32 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_format_32_32_32_32 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_format_32_32_32_32 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; - ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_format_32_32_32_32 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -... 
---- - -name: gfx11_tbuffer_load_float_32 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx11_tbuffer_load_float_32 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx11_tbuffer_load_float_32 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 
= COPY [[COPY6]].sub0_sub1 - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 - ; - ; GFX11-LABEL: name: gfx11_tbuffer_load_float_32 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 63, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 - ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 - ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 - ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 - ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 - ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 - ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 - ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, implicit $exec :: 
(dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -... ---- - -name: gfx11_tbuffer_load_sint_32 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx11_tbuffer_load_sint_32 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx11_tbuffer_load_sint_32 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = 
TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 76, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 - ; - ; GFX11-LABEL: name: gfx11_tbuffer_load_sint_32 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 49, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 62, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 - ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 - ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 - ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 - ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 59, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 - ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 - ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 - ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed 
[[COPY12]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -... ---- - -name: gfx11_tbuffer_load_uint_32 -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx11_tbuffer_load_uint_32 - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx11_tbuffer_load_uint_32 - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 62, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 75, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 - ; - ; GFX11-LABEL: name: gfx11_tbuffer_load_uint_32 - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 48, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 16, 61, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 - ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 - ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = 
COPY [[COPY6]].sub0_sub1 - ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 - ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 - ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 36, 58, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) - ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 - ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 - ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 - ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -... 
---- - -name: gfx11_tbuffer_load_not_merged_data_format_mismatch -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx11_tbuffer_load_not_merged_data_format_mismatch - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_data_format_mismatch - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_data_format_mismatch - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, 
%subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -... ---- - -name: gfx11_tbuffer_load_not_merged_num_format_mismatch -body: | - bb.0.entry: - ; GFX9-LABEL: name: gfx11_tbuffer_load_not_merged_num_format_mismatch - ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_num_format_mismatch - ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; - ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_num_format_mismatch - ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), 
align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET4:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET5:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) - %0:sgpr_32 = COPY $sgpr0 - %1:sgpr_32 = COPY $sgpr1 - %2:sgpr_32 = COPY $sgpr2 - %3:sgpr_32 = COPY $sgpr3 - %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -... 
----
-
-name: gfx11_tbuffer_store_x_xyz
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_x_xyz
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_x_xyz
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_x_xyz
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_xyz_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_xyz_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_xyz_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_xyz_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
-    TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_xy_xy
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_xy_xy
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_xy_xy
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_xy
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_x_xy
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_x_xy
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_x_xy
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_x_xy
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_xy_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_xy_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_xy_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
-    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_x_x
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_x_x
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_x_x
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_x_x
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_x_x_format_32_32_32_32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_x_x_format_32_32_32_32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_float32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_float32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_float32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_float32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 63, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_sint32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_sint32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_sint32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 76, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_sint32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 49, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 62, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 59, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_uint32
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_uint32
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_uint32
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 62, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX10-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 75, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_uint32
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 48, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
-    ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 61, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
-    ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; GFX11-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 58, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_not_merged_data_format_mismatch
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_not_merged_data_format_mismatch
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_store_not_merged_num_format_mismatch
-body: |
-  bb.0.entry:
-    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-LABEL: name: gfx11_tbuffer_store_not_merged_num_format_mismatch
-    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_store_not_merged_num_format_mismatch
-    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_store_not_merged_num_format_mismatch
-    ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
-    ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY8]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY7]], [[REG_SEQUENCE]], 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY6]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY5]], [[REG_SEQUENCE]], 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY3]], [[REG_SEQUENCE]], 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
-    %12:vgpr_32 = COPY $vgpr8
-    %11:vgpr_32 = COPY $vgpr7
-    %10:vgpr_32 = COPY $vgpr6
-    %9:vgpr_32 = COPY $vgpr5
-    %8:vgpr_32 = COPY $vgpr4
-    %7:vgpr_32 = COPY $vgpr3
-    %6:vgpr_32 = COPY $vgpr2
-    %5:vgpr_32 = COPY $vgpr1
-    %4:vgpr_32 = COPY $vgpr0
-    %3:sgpr_32 = COPY $sgpr3
-    %2:sgpr_32 = COPY $sgpr2
-    %1:sgpr_32 = COPY $sgpr1
-    %0:sgpr_32 = COPY $sgpr0
-    %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-    TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_not_merged_swizzled_0
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_0
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_0
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_0
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_not_merged_swizzled_1
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_1
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_1
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_1
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_merge_across_swizzle
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_merge_across_swizzle
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_merge_across_swizzle
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_merge_across_swizzle
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_x_idxen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_idxen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_idxen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_idxen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xy_idxen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xy_idxen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_idxen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_idxen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 60, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy_idxen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %4, %5:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xyz_idxen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN %4, %5:sgpr_128, 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_x_bothen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_bothen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_bothen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_bothen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xy_bothen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xy_bothen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_bothen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_bothen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 60, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy_bothen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN %4, %5:sgpr_128, 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xyz_bothen
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN %4, %5:sgpr_128, 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_x_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xy_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xy_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 60, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %5:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xyz_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact %4, %5:sgpr_128, 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_x_x_idxen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0_sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0_sub1
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_x_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xy_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xy_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 60, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy_bothen_exact
-body: |
-  bb.0.entry:
-
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_xyz_bothen_exact
-body: |
-  bb.0.entry:
-
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:vreg_64 = COPY $vgpr1
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:sgpr_32 = COPY $sgpr4
-    %5:vreg_64 = COPY $vgpr0
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vgpr_32 = COPY $vgpr0
-    %5:vgpr_32 = COPY $vgpr1
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %4, %6:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:sgpr_32 = COPY $sgpr4
-    %5:vgpr_32 = COPY $vgpr0
-    %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
-    %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %6:sgpr_128, 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact %5, %7:sgpr_128, 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_x_x_bothen_exact
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0_sub1
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
-    ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
-    ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0_sub1
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
-    ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
-    ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
----
-
-name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-body: |
-  bb.0.entry:
-    ; GFX9-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ;
-    ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    ;
-    ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0
-    ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
-    ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
-    ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
-    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
-    ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
-    ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
-    %0:sgpr_32 = COPY $sgpr0
-    %1:sgpr_32 = COPY $sgpr1
-    %2:sgpr_32 = COPY $sgpr2
-    %3:sgpr_32 = COPY $sgpr3
-    %4:vreg_64 = COPY $vgpr0
-    %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
-    %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-    %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-...
diff --git a/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu-flat.mir b/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu-flat.mir
new file mode 100644
index 0000000000000..95ccf6c0a4a33
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu-flat.mir
@@ -0,0 +1,357 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck --check-prefix=GCN %s
+
+---
+name: flat_load_saddr_to_valu
+tracksRegLiveness: true
+body: |
+  ; GCN-LABEL: name: flat_load_saddr_to_valu
+  ; GCN: bb.0:
+  ; GCN-NEXT: successors: %bb.1(0x80000000)
+  ; GCN-NEXT: liveins: $vgpr0_vgpr1
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1
+  ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+  ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+  ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
+  ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
+  ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+  ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
+  ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]], implicit $exec
+  ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc
+  ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT: S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:sreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %1, %3, 0, 0, implicit $exec, implicit $flat_scr
+    %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc
+    S_CMP_LG_U64 %2, 0, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: flat_load_saddr_to_valu_non_zero_vaddr
+tracksRegLiveness: true
+body: |
+  ; GCN-LABEL: name: flat_load_saddr_to_valu_non_zero_vaddr
+  ; GCN: bb.0:
+  ; GCN-NEXT: successors: %bb.1(0x80000000)
+  ; GCN-NEXT: liveins: $vgpr0_vgpr1
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1
+  ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI]].sub0, implicit $exec
+  ; GCN-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI]].sub1, implicit $exec
+  ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[REG_SEQUENCE]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+  ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+  ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
+  ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
+  ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+  ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
+  ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]], implicit $exec
+  ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc
+  ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT: S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:sreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1
+    %3:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %1, %3, 0, 0, implicit $exec, implicit $flat_scr
+    %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc
+    S_CMP_LG_U64 %2, 0, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+
+---
+name: flat_load_saddr_to_valu_undef_vaddr
+tracksRegLiveness: true
+body: |
+  ; GCN-LABEL: name: flat_load_saddr_to_valu_undef_vaddr
+  ; GCN: bb.0:
+  ; GCN-NEXT: successors: %bb.1(0x80000000)
+  ; GCN-NEXT: liveins: $vgpr0_vgpr1
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1
+  ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI]].sub0, implicit $exec
+  ; GCN-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI]].sub1, implicit $exec
+  ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[REG_SEQUENCE]], undef %4:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0
+  ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1
+  ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
+  ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
+  ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+  ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
+  ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]], implicit $exec
+  ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc
+  ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
+  ; GCN-NEXT: {{ $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT: S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:sreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1
+    %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %1, undef %3:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr
+    %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc
+    S_CMP_LG_U64 %2, 0, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+
+  bb.2:
+    S_ENDPGM 0
+...
+ +--- +name: flat_store_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_store_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: FLAT_STORE_DWORD [[PHI]], [[DEF]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %4:vgpr_32 = IMPLICIT_DEF + FLAT_STORE_DWORD_SADDR %3, %4, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: flat_atomic_noret_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_atomic_noret_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %6, %bb.1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: FLAT_ATOMIC_ADD [[PHI]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + FLAT_ATOMIC_ADD_SADDR %3, %3, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: flat_atomic_rtn_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_atomic_rtn_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[PHI]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %4:vgpr_32 = FLAT_ATOMIC_ADD_SADDR_RTN %3, %3, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... + +--- +name: scratch_load_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: scratch_load_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %6, %bb.1 + ; GCN-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[PHI]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI]], 1, implicit $exec + ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 [[V_AND_B32_e64_]], 0, implicit $exec + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0 + %0:sgpr_32 = COPY $vgpr0 + + bb.1: + %1:sgpr_32 = PHI %0, %bb.0, %2, %bb.1 + %4:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sgpr_32 = S_AND_B32 %1, 1, implicit-def $scc + S_CMP_LG_U32 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: scratch_store_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: scratch_store_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %6, %bb.1 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: SCRATCH_STORE_DWORD [[DEF]], [[PHI]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI]], 1, implicit $exec + ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 [[V_AND_B32_e64_]], 0, implicit $exec + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0 + %0:sgpr_32 = COPY $vgpr0 + + bb.1: + %1:sgpr_32 = PHI %0, %bb.0, %2, %bb.1 + %4:vgpr_32 = IMPLICIT_DEF + SCRATCH_STORE_DWORD_SADDR %4, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sgpr_32 = S_AND_B32 %1, 1, implicit-def $scc + S_CMP_LG_U32 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index 3844d6054e130..cf244f0b1f884 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,16 +6,15 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 +; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -43,16 +42,15 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX908-NEXT: s_mov_b32 s2, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX908-NEXT: s_mov_b32 s3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX908-NEXT: s_mov_b32 s3, 0 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir index ee5481617cf59..01506d0af1913 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir @@ 
-80,16 +80,16 @@ body: | ; COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 ; COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc - ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec - ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.1: ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; COALESCE-NEXT: {{ $}} - ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -103,10 +103,10 @@ body: | ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.3: @@ -134,16 +134,16 @@ body: | ; GFX908-COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 ; GFX908-COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 
0, 1, [[S_CSELECT_B64_]], implicit $exec ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec - ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.1: ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -157,10 +157,10 @@ body: | ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir index 49c0aaf9fb390..a9207de317ea1 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir @@ -67,7 +67,7 @@ body: | ; COALESCE-NEXT: bb.1: ; COALESCE-NEXT: successors: %bb.3(0x80000000) ; COALESCE-NEXT: {{ $}} - ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: S_BRANCH %bb.3 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.2: @@ -78,13 +78,13 @@ body: | ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 
[[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.3: - ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec ; COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 @@ -105,28 +105,28 @@ body: | ; GFX908-COALESCE-NEXT: bb.1: ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: S_BRANCH %bb.3 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.2: ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[AV_MOV_1]].sub0 + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub0 + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 - ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: 
[[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.3: - ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; GFX908-COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX908-COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll index 4459011742687..c529d5dde0d73 100644 --- a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll +++ b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll @@ -38,6 +38,9 @@ @printed.str.ptr.null = private addrspace(4) constant ptr null, align 4 @printed.str.ptr.undef = private addrspace(4) constant ptr poison, align 4 @format.str.f = private unnamed_addr addrspace(4) constant [33 x i8] c"%f %f %f %f %f %f %f %f %f %f %f\00", align 1 +@format.str.F = private unnamed_addr addrspace(4) constant [33 x i8] c"%F %F %F %F %F %F %F %F %F %F %F\00", align 1 +@format.str.a = private unnamed_addr addrspace(4) constant [33 x i8] c"%a %a %a %a %a %a %a %a %a %a %a\00", align 1 +@format.str.A = private unnamed_addr addrspace(4) constant [33 x i8] c"%A %A %A %A %A %A %A %A %A %A %A\00", align 1 @format.str.p = private unnamed_addr addrspace(4) constant [15 x i8] c"%p %p %p %p %p\00", align 1 @format.str.d = private unnamed_addr addrspace(4) constant [30 x i8] c"%d %d %d %d %d %d %d %d %d %d\00", align 1 @@ -102,6 +105,159 @@ define amdgpu_kernel void @format_str_f(float %f32.0, double %f64, float %f32.1, ret void } +define amdgpu_kernel void @format_str_F(float %f32.0, double %f64, float %f32.1, i16 %i16, i32 %i32, i64 %i64, half %f16) { +; R600-LABEL: @format_str_F( +; R600-NEXT: [[FPEXT_F32_TO_F64:%.*]] = fpext float [[F32_1:%.*]] to double +; R600-NEXT: [[CALL1:%.*]] = call i32 @printf(ptr addrspace(4) @format.str.F, float [[F32_0:%.*]], double 
[[F64:%.*]], double [[FPEXT_F32_TO_F64]], float 1.000000e+00, double 2.000000e+00, i16 [[I16:%.*]], i32 [[I32:%.*]], i64 [[I64:%.*]], <2 x float> , <2 x i32> , half [[F16:%.*]]) +; R600-NEXT: ret void +; +; GCN-LABEL: @format_str_F( +; GCN-NEXT: [[FPEXT_F32_TO_F64:%.*]] = fpext float [[F32_1:%.*]] to double +; GCN-NEXT: [[TMP4:%.*]] = sext i16 [[I16:%.*]] to i32 +; GCN-NEXT: [[TMP5:%.*]] = bitcast half [[F16:%.*]] to i16 +; GCN-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72) +; GCN-NEXT: br label [[DOTSPLIT:%.*]] +; GCN: .split: +; GCN-NEXT: [[TMP1:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +; GCN-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]] +; GCN: 5: +; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 +; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) +; GCN-NEXT: store i32 2, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +; GCN-NEXT: store float [[F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 +; GCN-NEXT: store double [[F64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +; GCN-NEXT: store double [[FPEXT_F32_TO_F64]], ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8 +; GCN-NEXT: store float 1.000000e+00, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 4 +; GCN-NEXT: store double 2.000000e+00, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 8 +; GCN-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4 +; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8 +; GCN-NEXT: store <2 x float> , ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8 +; GCN-NEXT: store <2 x i32> , ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], i32 8 +; GCN-NEXT: store i32 [[TMP6]], ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], align 4 +; GCN-NEXT: br label [[TMP3]] +; GCN: 6: +; GCN-NEXT: ret void +; + %fpext.f32.to.f64 = fpext float %f32.1 to double + %call1 = call i32 @printf(ptr addrspace(4) @format.str.F, float %f32.0, double %f64, double %fpext.f32.to.f64, float 1.0, double 2.0, i16 %i16, i32 %i32, i64 %i64, <2 x float> , <2 x i32> , half %f16) + ret void +} + +define amdgpu_kernel void @format_str_a(float %f32.0, double %f64, float %f32.1, i16 %i16, i32 %i32, 
i64 %i64, half %f16) { +; R600-LABEL: @format_str_a( +; R600-NEXT: [[FPEXT_F32_TO_F64:%.*]] = fpext float [[F32_1:%.*]] to double +; R600-NEXT: [[CALL1:%.*]] = call i32 @printf(ptr addrspace(4) @format.str.a, float [[F32_0:%.*]], double [[F64:%.*]], double [[FPEXT_F32_TO_F64]], float 1.000000e+00, double 2.000000e+00, i16 [[I16:%.*]], i32 [[I32:%.*]], i64 [[I64:%.*]], <2 x float> , <2 x i32> , half [[F16:%.*]]) +; R600-NEXT: ret void +; +; GCN-LABEL: @format_str_a( +; GCN-NEXT: [[FPEXT_F32_TO_F64:%.*]] = fpext float [[F32_1:%.*]] to double +; GCN-NEXT: [[TMP1:%.*]] = sext i16 [[I16:%.*]] to i32 +; GCN-NEXT: [[TMP2:%.*]] = bitcast half [[F16:%.*]] to i16 +; GCN-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72) +; GCN-NEXT: br label [[DOTSPLIT:%.*]] +; GCN: .split: +; GCN-NEXT: [[TMP4:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +; GCN-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]] +; GCN: 5: +; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 +; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) +; GCN-NEXT: store i32 3, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +; GCN-NEXT: store float [[F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 +; GCN-NEXT: store double [[F64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +; GCN-NEXT: store double [[FPEXT_F32_TO_F64]], ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8 +; GCN-NEXT: store float 1.000000e+00, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 4 +; GCN-NEXT: store double 2.000000e+00, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 8 +; GCN-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4 +; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8 +; GCN-NEXT: store <2 x float> , ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8 +; GCN-NEXT: store <2 x i32> , ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], i32 8 +; GCN-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[PRINTBUFFNEXTPTR9]], align 4 +; GCN-NEXT: br label [[TMP6]] +; GCN: 6: +; GCN-NEXT: ret void +; + %fpext.f32.to.f64 = fpext float %f32.1 to double + %call1 = call i32 @printf(ptr addrspace(4) @format.str.a, float %f32.0, double 
%f64, double %fpext.f32.to.f64, float 1.0, double 2.0, i16 %i16, i32 %i32, i64 %i64, <2 x float> , <2 x i32> , half %f16) + ret void +} + +define amdgpu_kernel void @format_str_A(float %f32.0, double %f64, float %f32.1, i16 %i16, i32 %i32, i64 %i64, half %f16) { +; R600-LABEL: @format_str_A( +; R600-NEXT: [[FPEXT_F32_TO_F64:%.*]] = fpext float [[F32_1:%.*]] to double +; R600-NEXT: [[CALL1:%.*]] = call i32 @printf(ptr addrspace(4) @format.str.A, float [[F32_0:%.*]], double [[F64:%.*]], double [[FPEXT_F32_TO_F64]], float 1.000000e+00, double 2.000000e+00, i16 [[I16:%.*]], i32 [[I32:%.*]], i64 [[I64:%.*]], <2 x float> , <2 x i32> , half [[F16:%.*]]) +; R600-NEXT: ret void +; +; GCN-LABEL: @format_str_A( +; GCN-NEXT: [[FPEXT_F32_TO_F64:%.*]] = fpext float [[F32_1:%.*]] to double +; GCN-NEXT: [[TMP4:%.*]] = sext i16 [[I16:%.*]] to i32 +; GCN-NEXT: [[TMP5:%.*]] = bitcast half [[F16:%.*]] to i16 +; GCN-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; GCN-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72) +; GCN-NEXT: br label [[DOTSPLIT:%.*]] +; GCN: .split: +; GCN-NEXT: [[TMP1:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null +; GCN-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]] +; GCN: 5: +; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 +; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) +; GCN-NEXT: store i32 4, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 +; GCN-NEXT: store float [[F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 +; GCN-NEXT: store double [[F64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8 +; GCN-NEXT: store double [[FPEXT_F32_TO_F64]], ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8 +; GCN-NEXT: store float 1.000000e+00, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 4 +; GCN-NEXT: store double 2.000000e+00, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 8 +; GCN-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4 +; GCN-NEXT: store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4 +; GCN-NEXT: [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 4 +; GCN-NEXT: store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8 +; GCN-NEXT: store <2 x float> , ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8 +; GCN-NEXT: store <2 x i32> , ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 8 +; GCN-NEXT: [[PRINTBUFFNEXTPTR9:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], i32 8 +; GCN-NEXT: store i32 [[TMP6]], ptr 
addrspace(1) [[PRINTBUFFNEXTPTR9]], align 4 +; GCN-NEXT: br label [[TMP3]] +; GCN: 6: +; GCN-NEXT: ret void +; + %fpext.f32.to.f64 = fpext float %f32.1 to double + %call1 = call i32 @printf(ptr addrspace(4) @format.str.A, float %f32.0, double %f64, double %fpext.f32.to.f64, float 1.0, double 2.0, i16 %i16, i32 %i32, i64 %i64, <2 x float> , <2 x i32> , half %f16) + ret void +} + define void @format_str_ptr(ptr %ptr.flat, ptr addrspace(3) %ptr.lds, ptr addrspace(1) %ptr.global, ptr addrspace(5) %ptr.stack, ptr addrspace(4) %ptr.const) { ; R600-LABEL: @format_str_ptr( ; R600-NEXT: [[CALL:%.*]] = call i32 @printf(ptr addrspace(4) @format.str.p, ptr [[PTR_FLAT:%.*]], ptr addrspace(3) [[PTR_LDS:%.*]], ptr addrspace(1) [[PTR_GLOBAL:%.*]], ptr addrspace(5) [[PTR_STACK:%.*]], ptr addrspace(4) [[PTR_CONST:%.*]]) @@ -116,7 +272,7 @@ define void @format_str_ptr(ptr %ptr.flat, ptr addrspace(3) %ptr.lds, ptr addrsp ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 2, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 5, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store ptr [[PTR_FLAT:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 8 @@ -153,7 +309,7 @@ define amdgpu_kernel void @format_str_d(i1 %i1, i4 %i4, i8 %i8, i24 %i24, i16 %i ; GCN: 6: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 3, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 6, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -200,7 +356,7 @@ define amdgpu_kernel void @format_str_u(i1 %i1, i4 %i4, i8 %i8, i24 %i24, i16 %i ; GCN: 6: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 4, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 7, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -247,7 +403,7 @@ define void @format_str_v1(<1 x float> %v1f32.0, <1 x float> %v1f32.1, <1 x doub ; GCN: 4: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 5, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 8, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store <1 x 
float> [[V1F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -291,7 +447,7 @@ define void @format_str_v2(<2 x float> %v2f32.0, <2 x float> %v2f32.1, <2 x doub ; GCN: 3: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 6, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 9, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store <2 x float> [[V2F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 8 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 8 @@ -334,7 +490,7 @@ define void @format_str_v3(<3 x float> %v3f32.0, <3 x float> %v3f32.1, <3 x doub ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 7, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 10, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store <3 x float> [[V3F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 16 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 16 @@ -377,7 +533,7 @@ define void @format_str_v4(<4 x float> %v4f32.0, <4 x float> %v4f32.1, <4 x doub ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 8, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 11, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store <4 x float> [[V4F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 16 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 16 @@ -420,7 +576,7 @@ define void @format_str_v8(<8 x float> %v8f32.0, <8 x float> %v8f32.1, <8 x doub ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 9, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 12, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store <8 x float> [[V8F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 32 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 32 @@ -463,7 +619,7 @@ define void @format_str_v16(<16 x float> %v16f32.0, <16 x float> %v16f32.1, <16 ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 10, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 13, ptr addrspace(1) 
[[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store <16 x float> [[V16F32_0:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 64 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 64 @@ -508,7 +664,7 @@ define amdgpu_kernel void @test_kernel(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 11, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 14, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -539,7 +695,7 @@ define amdgpu_kernel void @test_format_str_no_null_terminator(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 12, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 15, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 [[N:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: br label [[TMP2]] @@ -568,7 +724,7 @@ define amdgpu_kernel void @test_indexed_format_str(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 13, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 16, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 [[N:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: br label [[TMP2]] @@ -596,7 +752,7 @@ define amdgpu_kernel void @test_indexed_format_str_oob(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 14, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 17, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: br label [[TMP2]] ; GCN: 2: @@ -623,7 +779,7 @@ define amdgpu_kernel void @string_pointee_type(i32 %n) { ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 15, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 18, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], 
i32 4 @@ -651,7 +807,7 @@ define amdgpu_kernel void @string_address_space4(i32 %n, ptr addrspace(4) %str) ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 16, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 19, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -678,7 +834,7 @@ define amdgpu_kernel void @string_address_space1(i32 %n, ptr addrspace(1) %str) ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 17, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 20, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -705,7 +861,7 @@ define amdgpu_kernel void @string_format_passed_i32(i32 %n, i32 %str) { ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 18, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 21, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 [[STR:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -741,7 +897,7 @@ define amdgpu_kernel void @test_kernel_addrspacecasted_format_str(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 19, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 22, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -820,7 +976,7 @@ define amdgpu_kernel void @undef_initializer_gv(i32 %n) { ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 20, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 23, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: br label [[TMP3]] ; GCN: 3: @@ -849,7 +1005,7 @@ define amdgpu_kernel void @poison_initializer_gv(i32 %n) { ; GCN: 2: ; GCN-NEXT: 
[[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 21, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 24, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: br label [[TMP3]] ; GCN: 3: @@ -878,7 +1034,7 @@ define amdgpu_kernel void @zero_initializer_gv(i32 %n) { ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 22, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 25, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: br label [[TMP3]] ; GCN: 3: @@ -905,7 +1061,7 @@ define amdgpu_kernel void @test_print_string_literal_size0(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 23, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 26, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -935,7 +1091,7 @@ define amdgpu_kernel void @test_print_string_literal_1ai8.zero(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 24, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 27, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -965,7 +1121,7 @@ define amdgpu_kernel void @test_print_string_literal_1ai8.undef(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 25, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 28, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -995,7 +1151,7 @@ define amdgpu_kernel void @test_print_string_literal_i8.zero(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 26, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 
29, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1025,7 +1181,7 @@ define amdgpu_kernel void @test_print_string_literal_size2(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 27, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 30, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 27, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1055,7 +1211,7 @@ define amdgpu_kernel void @test_print_string_literal_size3(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 28, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 31, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 7195, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1085,7 +1241,7 @@ define amdgpu_kernel void @test_print_string_literal_size3_zero(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 29, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 32, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1115,7 +1271,7 @@ define amdgpu_kernel void @test_print_string_literal_size4(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 30, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 33, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 6513249, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1145,7 +1301,7 @@ define amdgpu_kernel void @test_print_string_literal_size4_nonull_term(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 31, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store 
i32 34, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1684234849, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1175,7 +1331,7 @@ define amdgpu_kernel void @test_print_string_literal_size5(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 32, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 35, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1684234849, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1205,7 +1361,7 @@ define amdgpu_kernel void @test_print_string_literal_size6(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 33, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 36, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1684234849, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1237,7 +1393,7 @@ define amdgpu_kernel void @test_print_string_literal_size7(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 34, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 37, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1684234849, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1269,7 +1425,7 @@ define amdgpu_kernel void @test_print_string_literal_size8(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 35, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 38, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1953722977, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1301,7 +1457,7 @@ define amdgpu_kernel void @test_print_string_literal_size9(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 36, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; 
GCN-NEXT: store i32 39, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1684104548, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1333,7 +1489,7 @@ define amdgpu_kernel void @test_print_string_literal_size16(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 37, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 40, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 909194801, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1369,7 +1525,7 @@ define amdgpu_kernel void @test_print_string_literal_size17(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 38, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 41, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 925972273, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1405,7 +1561,7 @@ define amdgpu_kernel void @test_print_string_literal_size20(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 39, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 42, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1684234849, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1443,7 +1599,7 @@ define amdgpu_kernel void @test_print_string_literal_size32(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 40, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 43, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1953722977, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1487,7 +1643,7 @@ define amdgpu_kernel void @test_print_string_not_constant_global(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 41, ptr addrspace(1) 
[[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 44, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1517,7 +1673,7 @@ define amdgpu_kernel void @test_print_string_constant_interposable_global(i32 %n ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 42, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 45, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1547,7 +1703,7 @@ define amdgpu_kernel void @test_print_string_literal_v4i8(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 43, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 46, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 67305985, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1577,7 +1733,7 @@ define amdgpu_kernel void @test_print_string_literal_v4i32(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 44, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 47, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1607,7 +1763,7 @@ define amdgpu_kernel void @test_print_string_literal_struct(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 45, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 48, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 66, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1637,7 +1793,7 @@ define amdgpu_kernel void @test_print_string_undef(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 46, ptr addrspace(1) 
[[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 49, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1667,7 +1823,7 @@ define amdgpu_kernel void @test_print_string_poison(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 47, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 50, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1697,7 +1853,7 @@ define amdgpu_kernel void @test_print_string_null(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 48, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 51, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1727,7 +1883,7 @@ define amdgpu_kernel void @test_print_string_inttoptr(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 49, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 52, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 4144959, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1757,7 +1913,7 @@ define amdgpu_kernel void @test_print_string_float_neg0(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 50, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 53, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1787,7 +1943,7 @@ define amdgpu_kernel void @test_print_string_float_0(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 51, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; 
GCN-NEXT: store i32 54, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1817,7 +1973,7 @@ define amdgpu_kernel void @test_print_string_ptr_null(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 52, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 55, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1847,7 +2003,7 @@ define amdgpu_kernel void @test_print_string_ptr_undef(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 53, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 56, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1877,7 +2033,7 @@ define amdgpu_kernel void @test_print_string_indexed(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 54, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 57, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 1936875892, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1913,7 +2069,7 @@ define amdgpu_kernel void @test_print_string_indexed_oob(i32 %n) { ; GCN: 1: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 55, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 58, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store i32 -256, ptr addrspace(1) [[PRINTBUFFGEP]], align 4 ; GCN-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 4 @@ -1946,7 +2102,7 @@ define amdgpu_kernel void @test_format_array_i16(i32 %n) { ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 56, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 59, ptr 
addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: br label [[TMP3]] ; GCN: 3: @@ -1975,7 +2131,7 @@ define amdgpu_kernel void @test_format_struct(i32 %n) { ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 57, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 60, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: br label [[TMP3]] ; GCN: 3: @@ -2015,7 +2171,7 @@ define void @printf_printf(i32 %n) { ; GCN: 2: ; GCN-NEXT: [[PRINTBUFFID:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 0 ; GCN-NEXT: [[PRINTBUFFIDCAST:%.*]] = bitcast ptr addrspace(1) [[PRINTBUFFID]] to ptr addrspace(1) -; GCN-NEXT: store i32 58, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 +; GCN-NEXT: store i32 61, ptr addrspace(1) [[PRINTBUFFIDCAST]], align 4 ; GCN-NEXT: [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4 ; GCN-NEXT: store ptr @printf, ptr addrspace(1) [[PRINTBUFFGEP]], align 8 ; GCN-NEXT: br label [[TMP3]] diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 663fd98b46bf7..ce96766116089 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -17,9 +17,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 - ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; REGALLOC-GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -42,8 +42,8 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable 
$vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec - ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec + ; PEI-GFX908-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; PEI-GFX908-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 @@ -62,9 +62,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 - ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX90A-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; REGALLOC-GFX90A-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 @@ -85,8 +85,8 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec - ; PEI-GFX90A-NEXT: 
renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec + ; PEI-GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; PEI-GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir index afbbb05bfc3a5..764a1e1090181 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass peephole-opt -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass peephole-opt -o - %s | FileCheck -check-prefixes=GCN,GFX942 %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 -run-pass peephole-opt -o - %s | FileCheck -check-prefixes=GCN,GFX1250 %s --- name: fold_simm_virtual @@ -564,6 +565,144 @@ body: | ... +--- +name: fmac_sreg_64_src0_to_fmamk_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; GFX942-LABEL: name: fmac_sreg_64_src0_to_fmamk_f64 + ; GFX942: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX942-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX942-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200 + ; GFX942-NEXT: [[V_FMAC_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_FMAC_F64_e64 0, [[S_MOV_B]], 0, [[DEF]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: SI_RETURN_TO_EPILOG [[V_FMAC_F64_e64_]] + ; + ; GFX1250-LABEL: name: fmac_sreg_64_src0_to_fmamk_f64 + ; GFX1250: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX1250-NEXT: [[V_FMAMK_F64_:%[0-9]+]]:vreg_64_align2 = V_FMAMK_F64 [[DEF]], 1311768467750121200, [[DEF1]], implicit $mode, implicit $exec + ; GFX1250-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F64_]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200 + %3:vreg_64_align2 = V_FMAC_F64_e64 0, %2, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG %3 +... + +--- +name: fmac_sreg_64_src1_to_fmamk_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; GCN-LABEL: name: fmac_sreg_64_src1_to_fmamk_f64 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200 + ; GCN-NEXT: [[V_FMAC_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_FMAC_F64_e64 0, [[DEF]], 0, [[DEF1]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAC_F64_e64_]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200 + %3:vreg_64_align2 = V_FMAC_F64_e64 0, %0, 0, %1, 0, %1, 0, 0, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG %3 +... 
+ +--- +name: fmac_vreg_64_to_fmaak_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; GFX942-LABEL: name: fmac_vreg_64_to_fmaak_f64 + ; GFX942: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX942-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX942-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GFX942-NEXT: [[V_FMAC_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_FMAC_F64_e64 0, [[DEF]], 0, [[DEF1]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: SI_RETURN_TO_EPILOG [[V_FMAC_F64_e64_]] + ; + ; GFX1250-LABEL: name: fmac_vreg_64_to_fmaak_f64 + ; GFX1250: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX1250-NEXT: [[V_FMAAK_F64_:%[0-9]+]]:vreg_64_align2 = V_FMAAK_F64 [[DEF]], [[DEF1]], 1311768467750121200, implicit $mode, implicit $exec + ; GFX1250-NEXT: SI_RETURN_TO_EPILOG [[V_FMAAK_F64_]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + %3:vreg_64_align2 = V_FMAC_F64_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG %3 +... + +--- +name: fma_sreg_64_src0_to_fmamk_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; GFX942-LABEL: name: fma_sreg_64_src0_to_fmamk_f64 + ; GFX942: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX942-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX942-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200 + ; GFX942-NEXT: [[V_FMA_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_FMA_F64_e64 0, [[S_MOV_B]], 0, [[DEF]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: SI_RETURN_TO_EPILOG [[V_FMA_F64_e64_]] + ; + ; GFX1250-LABEL: name: fma_sreg_64_src0_to_fmamk_f64 + ; GFX1250: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX1250-NEXT: [[V_FMAMK_F64_:%[0-9]+]]:vreg_64_align2 = V_FMAMK_F64 [[DEF]], 1311768467750121200, [[DEF1]], implicit $mode, implicit $exec + ; GFX1250-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F64_]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200 + %3:vreg_64_align2 = V_FMA_F64_e64 0, %2, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG %3 +... + +--- +name: fma_sreg_64_src1_to_fmamk_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; GCN-LABEL: name: fma_sreg_64_src1_to_fmamk_f64 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200 + ; GCN-NEXT: [[V_FMA_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_FMA_F64_e64 0, [[DEF]], 0, [[DEF1]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMA_F64_e64_]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200 + %3:vreg_64_align2 = V_FMA_F64_e64 0, %0, 0, %1, 0, %1, 0, 0, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG %3 +... 
+ +--- +name: fma_vreg_64_to_fmaak_f64 +tracksRegLiveness: true +body: | + bb.0: + + ; GFX942-LABEL: name: fma_vreg_64_to_fmaak_f64 + ; GFX942: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX942-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX942-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GFX942-NEXT: [[V_FMA_F64_e64_:%[0-9]+]]:vreg_64_align2 = V_FMA_F64_e64 0, [[DEF]], 0, [[DEF1]], 0, [[V_MOV_B]], 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: SI_RETURN_TO_EPILOG [[V_FMA_F64_e64_]] + ; + ; GFX1250-LABEL: name: fma_vreg_64_to_fmaak_f64 + ; GFX1250: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX1250-NEXT: [[V_FMAAK_F64_:%[0-9]+]]:vreg_64_align2 = V_FMAAK_F64 [[DEF]], [[DEF1]], 1311768467750121200, implicit $mode, implicit $exec + ; GFX1250-NEXT: SI_RETURN_TO_EPILOG [[V_FMAAK_F64_]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + %3:vreg_64_align2 = V_FMA_F64_e64 0, %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec + SI_RETURN_TO_EPILOG %3 +... + --- name: fold_v_mov_b32_e32_literal_to_agpr body: | diff --git a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir index c2c5340639a16..8145a1d7a2072 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir @@ -167,3 +167,59 @@ body: | %1:sreg_32 = COPY %0 S_BRANCH %bb.2 ... + +--- + +name: phi_moveimm_av_pseudo_input +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: phi_moveimm_av_pseudo_input + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $sgpr0, $sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI %5, %bb.3, [[S_ADD_U32_]], %bb.1 + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + bb.0: + successors: %bb.1 + liveins: $sgpr0, $sgpr1 + + %0:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + + %4:sreg_32 = COPY $sgpr0 + %5:sreg_32 = COPY $sgpr1 + + bb.1: + successors: %bb.2 + %2:sreg_32 = S_ADD_U32 %4, %5, implicit-def $scc + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + %3:sreg_32 = PHI %1, %bb.3, %2, %bb.1 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.2 + %1:sreg_32 = COPY %0 + S_BRANCH %bb.2 +... diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 1ec94162951a6..d48bfe0bb7f21 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -145,49 +145,29 @@ entry: ; Test skipping the lower-32-bit addition if it is unnecessary. 
define ptr @huge_offset_low_32_unused(ptr %p) { -; GFX942_PTRADD-LABEL: huge_offset_low_32_unused: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0 -; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1 -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: huge_offset_low_32_unused: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: huge_offset_low_32_unused: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX942-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000 ret ptr %gep } ; Reassociate address computation if it leads to more scalar operations. define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { -; GFX942_PTRADD-LABEL: reassoc_scalar_r: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: reassoc_scalar_r: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6 -; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: reassoc_scalar_r: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_u32 s2, s2, s6 +; GFX942-NEXT: s_addc_u32 s3, s3, s7 +; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942-NEXT: s_endpgm entry: %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 @@ -198,30 +178,18 @@ entry: } define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { -; GFX942_PTRADD-LABEL: reassoc_scalar_l: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: reassoc_scalar_l: -; GFX942_LEGACY: ; %bb.0: 
; %entry -; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6 -; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: reassoc_scalar_l: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_u32 s2, s2, s6 +; GFX942-NEXT: s_addc_u32 s3, s3, s7 +; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942-NEXT: s_endpgm entry: %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 @@ -233,24 +201,14 @@ entry: ; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) { -; GFX942_PTRADD-LABEL: shl_neg_offset: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2 -; GFX942_PTRADD-NEXT: s_nop 1 -; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: shl_neg_offset: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 -; GFX942_LEGACY-NEXT: s_nop 1 -; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: shl_neg_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] %offset = sub i64 0, %noffset %x = shl i64 %offset, %shift %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x @@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) { ; GFX942_PTRADD: ; %bb.0: ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1] -; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4 -; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12 +; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 +; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10 ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] ; ; GFX942_LEGACY-LABEL: complextype_global_gep: @@ -291,30 +248,63 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) { ; Tests the tryFoldToMad64_32 PTRADD combine. 
define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { -; GFX942_PTRADD-LABEL: fold_mad64: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0 -; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: fold_mad64: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] -; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: fold_mad64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; GFX942-NEXT: global_store_dword v[0:1], v2, off +; GFX942-NEXT: s_endpgm %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0 store float 1.0, ptr addrspace(1) %p1 ret void } + +; Use non-zero shift amounts in v_lshl_add_u64. +define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: select_v_lshl_add_u64: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: select_v_lshl_add_u64: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] +; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i64, ptr %base, i64 %voffset + ret ptr %gep +} + +; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the +; mul into a mul24. 
+define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) { +; GFX942_PTRADD-LABEL: fold_mul24_into_mad: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT: v_and_b32_e32 v2, 0xfffff, v2 +; GFX942_PTRADD-NEXT: v_and_b32_e32 v4, 0xfffff, v4 +; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v4 +; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v2, v2, v4 +; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: fold_mul24_into_mad: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT: v_and_b32_e32 v2, 0xfffff, v2 +; GFX942_LEGACY-NEXT: v_and_b32_e32 v3, 0xfffff, v4 +; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] +; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] + %a_masked = and i64 %a, u0xfffff + %b_masked = and i64 %b, u0xfffff + %mul = mul i64 %a_masked, %b_masked + %gep = getelementptr inbounds i8, ptr %base, i64 %mul + ret ptr %gep +} diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir new file mode 100644 index 0000000000000..d27b4eaff1ed9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs --start-before=greedy,2 --stop-after=greedy,2 %s -o - | FileCheck %s + +# Make sure there's no machine verifier error + +# If RA is unable to find a register to allocate, then cleanupFailedVReg will do ad-hoc rewriting and will insert undefs to make the LiveRanges workable. +# %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 is an example of such an undef rewrite. If we want to spill %30, we should not insert +# actual spill code, as the source operand is undef. +# Check that there are no verifier issues with the LiveRange of $vgpr0_vgpr1_vgpr2_vgpr3 and that we do not insert spill code for %30. + + +--- | + define void @foo() #0 { + ret void + } + + attributes #0 = { "amdgpu-waves-per-eu"="8,8" } + +...
+ +--- +name: foo +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 32, alignment: 4 } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledVGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + bb.0: + ; CHECK-LABEL: name: foo + ; CHECK: INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10 /* regdef */, def %10, 10 /* regdef */, def %1, 10 /* regdef */, def %2, 10 /* regdef */, def $vgpr0_vgpr1_vgpr2_vgpr3, 10 /* regdef */, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK-NEXT: KILL undef $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: SI_SPILL_AV160_SAVE %2, %stack.1, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV256_SAVE %1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY %10 + ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK-NEXT: SI_SPILL_AV512_SAVE [[COPY1]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: [[SI_SPILL_AV512_RESTORE:%[0-9]+]]:av_512 = SI_SPILL_AV512_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY [[SI_SPILL_AV512_RESTORE]] + ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV512_SAVE [[SI_SPILL_V512_RESTORE]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV256_RESTORE:%[0-9]+]]:av_256 = SI_SPILL_AV256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV256_SAVE [[SI_SPILL_AV256_RESTORE]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV160_RESTORE:%[0-9]+]]:vreg_160 = SI_SPILL_AV160_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV512_RESTORE1:%[0-9]+]]:av_512 = SI_SPILL_AV512_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.4, align 4, addrspace 
5) + ; CHECK-NEXT: [[SI_SPILL_AV256_RESTORE1:%[0-9]+]]:av_256 = SI_SPILL_AV256_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9 /* reguse */, [[SI_SPILL_AV512_RESTORE1]], 9 /* reguse */, [[SI_SPILL_AV256_RESTORE1]], 9 /* reguse */, [[SI_SPILL_AV160_RESTORE]], 9 /* reguse */, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9 /* reguse */, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 + ; CHECK-NEXT: SI_RETURN + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10, def %22:vreg_512, 10, def %25:vreg_256, 10, def %28:vreg_160, 10, def $vgpr0_vgpr1_vgpr2_vgpr3, 10, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 + %27:av_160 = COPY %28:vreg_160 + %24:av_256 = COPY %25:vreg_256 + SI_SPILL_V512_SAVE %22:vreg_512, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + %18:vreg_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY %18:vreg_512 + %23:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + %26:vreg_256 = COPY %24:av_256 + %29:vreg_160 = COPY %27:av_160 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %30:av_128 + INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9, %23:vreg_512, 9, %26:vreg_256, 9, %29:vreg_160, 9, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 + SI_RETURN + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index ba9dd8f7c2468..5d0e4bf1d34d0 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -559,16 +559,19 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 @@ -1943,16 +1946,19 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-fp.ll b/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-fp.ll new file mode 100644 index 0000000000000..11af704d30973 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-fp.ll @@ -0,0 +1,1429 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX1010 %s + +; Test the CMP+SELECT optimization that folds shared constants to reduce +; register pressure. 
+ +;------------------------------------------------------------------------------ +; F32 Tests +;------------------------------------------------------------------------------ + +; Should be folded: fcmp oeq + select with constant in true value +define float @fcmp_select_fold_oeq_f32_imm(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_f32_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_f32_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x40490FDB00000000 + %sel = select i1 %cmp, float 0x40490FDB00000000, float %other + ret float %sel +} + +; Should be folded: fcmp oeq + select with constant in true value (commutative) +define float @fcmp_select_fold_oeq_imm_f32(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_imm_f32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_imm_f32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float 0x40490FDB00000000, %arg + %sel = select i1 %cmp, float 0x40490FDB00000000, float %other + ret float %sel +} + +; Should be folded: fcmp one + select with constant in false value +define float @fcmp_select_fold_one_f32_imm(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_fold_one_f32_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x402df850 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_f32_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x402df850, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float %arg, 0x4005BF0A00000000 + %sel = select i1 %cmp, float %other, float 0x4005BF0A00000000 + ret float %sel +} + +; Should be folded: fcmp one + select with constant in false value (commutative) +define float @fcmp_select_fold_one_imm_f32(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_fold_one_imm_f32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x402df850 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_imm_f32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x402df850, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 
s[30:31] +entry: + %cmp = fcmp one float 0x4005BF0A00000000, %arg + %sel = select i1 %cmp, float %other, float 0x4005BF0A00000000 + ret float %sel +} + +; Should NOT be folded: different constants +define float @fcmp_select_no_fold_f32_different_const(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_different_const: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x46487ed8 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_different_const: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x46487ed8, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x40490FDB00000000 + %sel = select i1 %cmp, float 0x40C90FDB00000000, float %other + ret float %sel +} + +; Should NOT be folded: fcmp oeq with constant in other position +define float @fcmp_select_no_fold_f32_other_pos(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x42487ed8 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x42487ed8, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x40490FDB00000000 + %sel = select i1 %cmp, float %other, float 0x40490FDB00000000 + ret float %sel +} + +; Should NOT be folded: unsupported comparison type +define float @fcmp_select_no_fold_f32_unsupported_cmp(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x42487ed8 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x42487ed8, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp olt float %arg, 0x40490FDB00000000 + %sel = select i1 %cmp, float %other, float 0x40490FDB00000000 + ret float %sel +} + +; Should NOT be folded: imm can be encoded into cndmask +define float @fcmp_select_no_fold_f32_enc_imm(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, 1.0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: 
v_cmp_neq_f32_e32 vcc_lo, 1.0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 1.0 + %sel = select i1 %cmp, float 1.0, float %other + ret float %sel +} + +; Should NOT be folded: imm can be encoded into cndmask +define float @fcmp_select_no_fold_f32_enc_imm_2(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, -4.0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, -4.0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float -4.0, %arg + %sel = select i1 %cmp, float %other, float -4.0 + ret float %sel +} + +; Should NOT be folded: fcmp oeq with zero constant +define float @fcmp_select_no_fold_oeq_f32_zero(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_f32_zero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_f32_zero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0.0 + %sel = select i1 %cmp, float 0.0, float %other + ret float %sel +} + +; Should NOT be folded: fcmp one with negative zero constant +define float @fcmp_select_no_fold_one_f32_negzero(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_f32_negzero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_brev_b32 s4, 1 +; GFX900-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_f32_negzero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x80000000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x80000000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float -0.0, %arg ; 0x8000000000000000 + %sel = select i1 %cmp, float %other, float -0.0 ;0x8000000000000000 + ret float %sel +} + +; NaN values should bypass the optimization due to special IEEE 754 behavior +; fcmp oeq with NaN always returns false, so select always chooses %other +define float @fcmp_select_no_fold_oeq_f32_nan(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_f32_nan: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_f32_nan: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, v1 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x7FF8000000000000 + %sel = select i1 %cmp, float 0x7FF8000000000000, float 
%other + ret float %sel +} + +; NaN values should bypass the optimization due to special IEEE 754 behavior +; fcmp one with NaN always returns false, so select always chooses the NaN constant +define float @fcmp_select_no_fold_one_f32_nan(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_f32_nan: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_f32_nan: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float 0x7FF8000000000000, %arg + %sel = select i1 %cmp, float %other, float 0x7FF8000000000000 + ret float %sel +} + +; Should NOT be folded: fcmp oeq with positive infinity +; Infinity values should bypass the optimization, generating unfolded code +define float @fcmp_select_no_fold_posinf_oeq_f32(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_posinf_oeq_f32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_oeq_f32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x7f800000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x7FF0000000000000 + %sel = select i1 %cmp, float 0x7FF0000000000000, float %other + ret float %sel +} + +; Should NOT be folded: fcmp one with negative infinity +; Infinity values should bypass the optimization, generating unfolded code +define float @fcmp_select_no_fold_neginf_f32_one(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_f32_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xff800000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_f32_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0xff800000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xff800000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float 0xFFF0000000000000, %arg + %sel = select i1 %cmp, float %other, float 0xFFF0000000000000 + ret float %sel +} + +;------------------------------------------------------------------------------ +; F64 Tests +;------------------------------------------------------------------------------ + +; Should be folded: f64 fcmp oeq + select with constant in true value +define double @fcmp_select_fold_oeq_f64_imm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_f64_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_eq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1,
v3, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_f64_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_eq_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 3.141592653589793 + %sel = select i1 %cmp, double 3.141592653589793, double %other + ret double %sel +} +; Should be folded: f64 fcmp oeq + select with constant in true value (commutative) +define double @fcmp_select_fold_oeq_imm_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_imm_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_eq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_imm_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_eq_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double 3.141592653589793, %arg + %sel = select i1 %cmp, double 3.141592653589793, double %other + ret double %sel +} + +; Should be folded: f64 fcmp one + select with constant in false value +define double @fcmp_select_fold_one_f64_imm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_fold_one_f64_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x8b145769 +; GFX900-NEXT: s_mov_b32 s5, 0x4005bf0a +; GFX900-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_f64_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x8b145769 +; GFX1010-NEXT: s_mov_b32 s5, 0x4005bf0a +; GFX1010-NEXT: v_cmp_lg_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double %arg, 2.718281828459045 + %sel = select i1 %cmp, double %other, double 2.718281828459045 + ret double %sel +} +; Should be folded: f64 fcmp one + select with constant in false value (commutative) +define double @fcmp_select_fold_one_imm_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_fold_one_imm_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x8b145769 +; GFX900-NEXT: s_mov_b32 s5, 0x4005bf0a +; GFX900-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_imm_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x8b145769 +; GFX1010-NEXT: s_mov_b32 s5, 0x4005bf0a +; GFX1010-NEXT: v_cmp_lg_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double 2.718281828459045, %arg + %sel = select i1 %cmp, double %other, double 2.718281828459045 + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with constant in other position +define double @fcmp_select_no_fold_f64_other_pos(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_eq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x54442d18 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x400921fb +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_eq_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x54442d18, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x400921fb, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 3.141592653589793 + %sel = select i1 %cmp, double %other, double 3.141592653589793 + ret double %sel +} + +; Should NOT be folded: f64 fcmp unsupported comparison type +define double @fcmp_select_no_fold_f64_unsupported_cmp(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x54442d18 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x400921fb +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_gt_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x54442d18, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x400921fb, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp olt double %arg, 3.141592653589793 + %sel = select i1 %cmp, double %other, double 3.141592653589793 + ret double %sel +} + +; Should NOT be folded: imm can be encoded into cndmask +define double @fcmp_select_no_fold_f64_enc_imm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, 1.0, v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 1.0, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 1.0 + %sel = select i1 %cmp, double 1.0, double %other + ret double %sel +} + +; Should NOT be folded: imm can be encoded into cndmask +define double @fcmp_select_no_fold_f64_enc_imm_2(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_lg_f64_e32 vcc, -4.0, v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0xc0100000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f64_e32 vcc_lo, -4.0, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0xc0100000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double -4.0, %arg + %sel = select i1 %cmp, double %other, double -4.0 + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with zero constant +define double @fcmp_select_no_fold_oeq_f64_zero(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_f64_zero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, 0, v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_f64_zero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 0.0 + %sel = select i1 %cmp, double 0.0, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp one with negative zero constant +define double @fcmp_select_no_fold_one_f64_negzero(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_f64_negzero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_brev_b32 s5, 1 +; GFX900-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_f64_negzero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0x80000000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x80000000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double -0.0, %arg + %sel = select i1 %cmp, double %other, double -0.0 + ret double %sel +} + +; Should NOT be folded: f64 different constants +define double @fcmp_select_no_fold_f64_different_const(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_different_const: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x8b145769 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x4005bf0a +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_different_const: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x8b145769, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x4005bf0a, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 3.141592653589793 + %sel = select i1 %cmp, double 2.718281828459045, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with NaN constant +; fcmp oeq with NaN always returns false, so select always chooses %other +define double @fcmp_select_no_fold_nan_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v1, v3 +; GFX1010-NEXT: v_mov_b32_e32 v0, v2 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 0x7FF8000000000000 + %sel = select i1 %cmp, double 0x7FF8000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with NaN constant (commutative variant) +; fcmp oeq with NaN always returns false, so select always chooses %other +define double @fcmp_select_no_fold_nan_f64_comm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f64_comm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f64_comm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v1, v3 +; GFX1010-NEXT: v_mov_b32_e32 v0, v2 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double 0x7FF8000000000000, %arg + %sel = select i1 %cmp, double 0x7FF8000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp one with NaN constant +; fcmp one with NaN always returns false, so select always chooses the NaN constant +define double @fcmp_select_no_fold_nan_f64_one(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f64_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f64_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double %arg, 0x7FF8000000000000 + %sel = select i1 %cmp, double 
%other, double 0x7FF8000000000000 + ret double %sel +} + +; Should NOT be folded: f64 fcmp one with NaN constant (commutative variant) +; fcmp one with NaN always returns false, so select always chooses the NaN constant +define double @fcmp_select_no_fold_nan_f64_one_comm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f64_one_comm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f64_one_comm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double 0x7FF8000000000000, %arg + %sel = select i1 %cmp, double %other, double 0x7FF8000000000000 + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with positive infinity +; Infinity values should bypass the optimization, generating unfolded code +define double @fcmp_select_no_fold_posinf_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_posinf_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x7ff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 0x7FF0000000000000 + %sel = select i1 %cmp, double 0x7FF0000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with negative infinity +; Infinity values should bypass the optimization, generating unfolded code +define double @fcmp_select_no_fold_neginf_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s5, 0xfff00000 +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0xfff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0xfff00000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0xfff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 0xFFF0000000000000 + %sel = select i1 %cmp, double 0xFFF0000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with positive infinity (commutative variant) +; Infinity values should bypass the optimization, generating unfolded code +define double @fcmp_select_no_fold_posinf_f64_comm(double %arg, double %other) { +; GFX900-LABEL: 
fcmp_select_no_fold_posinf_f64_comm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_f64_comm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x7ff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double 0x7FF0000000000000, %arg + %sel = select i1 %cmp, double 0x7FF0000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with negative infinity (commutative variant) +; Infinity values should bypass the optimization, generating unfolded code +define double @fcmp_select_no_fold_neginf_f64_comm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_f64_comm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s5, 0xfff00000 +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0xfff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_f64_comm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0xfff00000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0xfff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double 0xFFF0000000000000, %arg + %sel = select i1 %cmp, double 0xFFF0000000000000, double %other + ret double %sel +} + +;------------------------------------------------------------------------------ +; F16 Tests +;------------------------------------------------------------------------------ + +; Should be folded: f16 fcmp oeq + select with constant in true value +define half @fcmp_select_fold_oeq_f16_imm(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_f16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_cmp_eq_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_f16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH4248 + %sel = select i1 %cmp, half 0xH4248, half %other + ret half %sel +} + +; Should be folded: f16 fcmp oeq + select with constant in true value (commutative) +define half @fcmp_select_fold_oeq_imm_f16(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_imm_f16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_cmp_eq_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 
v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_imm_f16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half 0xH4248, %arg + %sel = select i1 %cmp, half 0xH4248, half %other + ret half %sel +} + +; Should be folded: f16 fcmp one + select with constant in false value +define half @fcmp_select_fold_one_f16_imm(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_fold_one_f16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4020 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_f16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x4020, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half %arg, 0xH4020 + %sel = select i1 %cmp, half %other, half 0xH4020 + ret half %sel +} + +; Should be folded: f16 fcmp one + select with constant in false value (commutative) +define half @fcmp_select_fold_one_imm_f16(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_fold_one_imm_f16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4020 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_imm_f16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x4020, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half 0xH4020, %arg + %sel = select i1 %cmp, half %other, half 0xH4020 + ret half %sel +} + +; Should NOT be folded: different constants +define half @fcmp_select_no_fold_f16_different_const(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_f16_different_const: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4300 +; GFX900-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f16_different_const: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4300, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH4248 + %sel = select i1 %cmp, half 0xH4300, half %other + ret half %sel +} + +; Should NOT be folded: NaN values bypass optimization +define half @fcmp_select_no_fold_nan_f16(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, v1 
+; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH7e00 + %sel = select i1 %cmp, half 0xH7e00, half %other + ret half %sel +} + +; Should NOT be folded: f16 fcmp one with NaN constant +define half @fcmp_select_no_fold_nan_f16_one(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0x7e00 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0x7e00 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half %arg, 0xH7e00 + %sel = select i1 %cmp, half %other, half 0xH7e00 + ret half %sel +} + +; Should NOT be folded: f16 fcmp one with +Inf constant +define half @fcmp_select_no_fold_posinf_f16_one(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_posinf_f16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7c00 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7c00 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_f16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x7c00, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half %arg, 0xH7c00 + %sel = select i1 %cmp, half %other, half 0xH7c00 + ret half %sel +} + +; Should NOT be folded: f16 fcmp one with -Inf constant +define half @fcmp_select_no_fold_neginf_f16_one(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_f16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xfc00 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xfc00 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_f16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0xfc00, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xfc00, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half %arg, 0xHfc00 + %sel = select i1 %cmp, half %other, half 0xHfc00 + ret half %sel +} +; Should NOT be folded: f16 fcmp oeq with zero constant +define half @fcmp_select_no_fold_oeq_f16_zero(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_f16_zero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f16_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_f16_zero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH0000 + %sel = select i1 %cmp, half 0xH0000, half %other + ret half %sel +} +; Should NOT be folded: f16 fcmp one with negative zero constant +define half @fcmp_select_no_fold_one_f16_negzero(half %arg, half %other) { +; 
GFX900-LABEL: fcmp_select_no_fold_one_f16_negzero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x8000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x8000 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_f16_negzero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x8000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x8000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half 0xH8000, %arg + %sel = select i1 %cmp, half %other, half 0xH8000 + ret half %sel +} + +; Should NOT be folded: f16 fcmp oeq with constant in other position +define half @fcmp_select_no_fold_f16_other_pos(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_f16_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4248 +; GFX900-NEXT: v_cmp_eq_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f16_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4248, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH4248 + %sel = select i1 %cmp, half %other, half 0xH4248 + ret half %sel +} + +; Should NOT be folded: f16 unsupported comparison type +define half @fcmp_select_no_fold_f16_unsupported_cmp(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_f16_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4248 +; GFX900-NEXT: v_cmp_gt_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f16_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4248, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp olt half %arg, 0xH4248 + %sel = select i1 %cmp, half %other, half 0xH4248 + ret half %sel +} + +;------------------------------------------------------------------------------ +; BF16 Tests +;------------------------------------------------------------------------------ + +; Should be folded: bfloat fcmp oeq + select with constant in true value +define bfloat @fcmp_select_fold_oeq_bf16_imm(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_bf16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_bf16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42480000, v2 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; 
GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR4248 + %sel = select i1 %cmp, bfloat 0xR4248, bfloat %other + ret bfloat %sel +} + +; Should be folded: bfloat fcmp oeq + select with constant in true value (commutative) +define bfloat @fcmp_select_fold_oeq_imm_bf16(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_imm_bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_imm_bf16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42480000, v2 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat 0xR4248, %arg + %sel = select i1 %cmp, bfloat 0xR4248, bfloat %other + ret bfloat %sel +} + +; Should be folded: bfloat fcmp one + select with constant in false value +define bfloat @fcmp_select_fold_one_bf16_imm(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_fold_one_bf16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x40200000 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_bf16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x40200000, v2 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat %arg, 0xR4020 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR4020 + ret bfloat %sel +} + +; Should be folded: bfloat fcmp one + select with constant in false value (commutative) +define bfloat @fcmp_select_fold_one_imm_bf16(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_fold_one_imm_bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x40200000 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_imm_bf16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x40200000, v2 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat 0xR4020, %arg + %sel = select i1 %cmp, bfloat %other, bfloat 0xR4020 + ret bfloat %sel +} + +; Should NOT be folded: different constants +define bfloat @fcmp_select_no_fold_bf16_different_const(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_bf16_different_const: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4300 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 
v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_bf16_different_const: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x42480000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4300, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR4248 + %sel = select i1 %cmp, bfloat 0xR4300, bfloat %other + ret bfloat %sel +} + +; Should NOT be folded: NaN values bypass optimization +define bfloat @fcmp_select_no_fold_nan_bf16(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_bf16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, v1 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR7FC0 + %sel = select i1 %cmp, bfloat 0xR7FC0, bfloat %other + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp one with NaN constant +define bfloat @fcmp_select_no_fold_nan_bf16_one(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_bf16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0x7fc0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_bf16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0x7fc0 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat %arg, 0xR7FC0 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR7FC0 + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp one with +Inf constant +define bfloat @fcmp_select_no_fold_posinf_bf16_one(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_posinf_bf16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f80 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_bf16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x7f800000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7f80, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat %arg, 0xR7F80 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR7F80 + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp one with -Inf constant +define bfloat @fcmp_select_no_fold_neginf_bf16_one(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_bf16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0xff800000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_bf16_one: +; GFX1010: ; %bb.0: ; %entry +; 
GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0xff800000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xffffff80, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat %arg, 0xRFF80 + %sel = select i1 %cmp, bfloat %other, bfloat 0xRFF80 + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp oeq with zero constant +define bfloat @fcmp_select_no_fold_oeq_bf16_zero(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_bf16_zero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_bf16_zero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR0000 + %sel = select i1 %cmp, bfloat 0xR0000, bfloat %other + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp one with negative zero constant +define bfloat @fcmp_select_no_fold_one_bf16_negzero(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_bf16_negzero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_brev_b32 s4, 1 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_bf16_negzero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x80000000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xffff8000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat 0xR8000, %arg + %sel = select i1 %cmp, bfloat %other, bfloat 0xR8000 + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp oeq with constant in other position +define bfloat @fcmp_select_no_fold_bf16_other_pos(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_bf16_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4248 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_bf16_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42480000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4248, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR4248 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR4248 + ret bfloat %sel +} + +; Should NOT be folded: bfloat unsupported comparison type +define bfloat @fcmp_select_no_fold_bf16_unsupported_cmp(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_bf16_unsupported_cmp: +; GFX900: ; %bb.0: 
; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4248 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_bf16_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x42480000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4248, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp olt bfloat %arg, 0xR4248 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR4248 + ret bfloat %sel +} diff --git a/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-int.ll b/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-int.ll new file mode 100644 index 0000000000000..4383cfd36f945 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-int.ll @@ -0,0 +1,955 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX1010 %s + +;------------------------------------------------------------------------------ +; I32 Tests +;------------------------------------------------------------------------------ + +; Should be folded: icmp eq + select with constant in true value +define i32 @icmp_select_fold_eq_i32_imm(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_fold_eq_i32_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_i32_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 %arg, 4242 + %sel = select i1 %cmp, i32 4242, i32 %other + ret i32 %sel +} + +; Should be folded: icmp eq + select with constant in true value (commutative) +define i32 @icmp_select_fold_eq_imm_i32(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_fold_eq_imm_i32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_imm_i32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 4242, %arg + %sel = select i1 %cmp, i32 4242, i32 %other + ret i32 %sel +} + +; Should be folded: icmp ne + select with constant in false value +define i32 @icmp_select_fold_ne_i32_imm(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_fold_ne_i32_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, 
vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_i32_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i32 %arg, 4242 + %sel = select i1 %cmp, i32 %other, i32 4242 + ret i32 %sel +} + +; Should be folded: icmp ne + select with constant in false value (commutative) +define i32 @icmp_select_fold_ne_imm_i32(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_fold_ne_imm_i32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_imm_i32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i32 4242, %arg + %sel = select i1 %cmp, i32 %other, i32 4242 + ret i32 %sel +} + +; Should NOT be folded: icmp eq with different constants +define i32 @icmp_select_no_fold_i32_different(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_different: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x978 +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_different: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x978, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 %arg, 4242 + %sel = select i1 %cmp, i32 2424, i32 %other + ret i32 %sel +} + +; Should NOT be folded: icmp eq with constant in other position +define i32 @icmp_select_no_fold_i32_other_pos(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x1092 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 %arg, 4242 + %sel = select i1 %cmp, i32 %other, i32 4242 + ret i32 %sel +} + +; Should NOT be folded: unsupported comparison type +define i32 @icmp_select_no_fold_i32_unsupported_cmp(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1094 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x102d +; GFX900-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: 
icmp_select_no_fold_i32_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x1094, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x102d, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ugt i32 %arg, 4243 + %sel = select i1 %cmp, i32 4141, i32 %other + ret i32 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i32 @icmp_select_no_fold_i32_enc_imm(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 %arg, 0 + %sel = select i1 %cmp, i32 0, i32 %other + ret i32 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i32 @icmp_select_no_fold_i32_enc_imm_2(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 64, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 64, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 64, %arg + %sel = select i1 %cmp, i32 64, i32 %other + ret i32 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i32 @icmp_select_no_fold_i32_enc_imm_3(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_enc_imm_3: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, -16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, -16, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_enc_imm_3: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, -16, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -16, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i32 %arg, -16 + %sel = select i1 %cmp, i32 %other, i32 -16 + ret i32 %sel +} + +;------------------------------------------------------------------------------ +; I64 Tests +;------------------------------------------------------------------------------ + +; Should be folded: icmp eq + select with constant in true value +define i64 @icmp_select_fold_eq_i64_imm(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_fold_eq_i64_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_i64_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 424242424242, i64 %other + ret i64 %sel +} + +; Should be folded: icmp eq + select with constant in true value (commutative) +define i64 @icmp_select_fold_eq_imm_i64(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_fold_eq_imm_i64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_imm_i64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 424242424242, %arg + %sel = select i1 %cmp, i64 424242424242, i64 %other + ret i64 %sel +} + +; Should be folded: icmp ne + select with constant in false value +define i64 @icmp_select_fold_ne_i64_imm(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_fold_ne_i64_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_i64_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 %other, i64 424242424242 + ret i64 %sel +} + +; Should be folded: icmp ne + select with constant in false value (commutative) +define i64 @icmp_select_fold_ne_imm_i64(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_fold_ne_imm_i64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_imm_i64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 
s[30:31] +entry: + %cmp = icmp ne i64 424242424242, %arg + %sel = select i1 %cmp, i64 %other, i64 424242424242 + ret i64 %sel +} + +; Should NOT be folded: icmp eq with different constants +define i64 @icmp_select_no_fold_i64_different(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_different: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x719c60f8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_different: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x719c60f8, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 242424242424, i64 %other + ret i64 %sel +} + +; Should NOT be folded: icmp eq with constant in other position +define i64 @icmp_select_no_fold_i64_other_pos(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0xc6d1a9b2 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xc6d1a9b2, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x62, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 %other, i64 424242424242 + ret i64 %sel +} + +; Should NOT be folded: unsupported comparison type +define i64 @icmp_select_no_fold_i64_unsupported_cmp(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b3 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0xc6d1a9b2 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b3 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xc6d1a9b2, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x62, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ugt i64 %arg, 
424242424242 + %sel = select i1 %cmp, i64 424242424242, i64 %other + ret i64 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i64 @icmp_select_no_fold_i64_enc_imm(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 %arg, 0 + %sel = select i1 %cmp, i64 0, i64 %other + ret i64 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i64 @icmp_select_no_fold_i64_enc_imm_2(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, 32, v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, 32, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, 32, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 32, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 32, %arg + %sel = select i1 %cmp, i64 32, i64 %other + ret i64 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i64 @icmp_select_no_fold_i64_enc_imm_3(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_enc_imm_3: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, -8, v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, -8, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_enc_imm_3: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, -8, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -8, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i64 %arg, -8 + %sel = select i1 %cmp, i64 %other, i64 -8 + ret i64 %sel +} + +;------------------------------------------------------------------------------ +; I16 Tests +;------------------------------------------------------------------------------ + +; Should be folded: icmp eq + select with constant in true value +define i16 @icmp_select_fold_eq_i16_imm(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_fold_eq_i16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_i16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u16_e32 
vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 %arg, 4242 + %sel = select i1 %cmp, i16 4242, i16 %other + ret i16 %sel +} + +; Should be folded: icmp eq + select with constant in true value (commutative) +define i16 @icmp_select_fold_eq_imm_i16(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_fold_eq_imm_i16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_imm_i16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 4242, %arg + %sel = select i1 %cmp, i16 4242, i16 %other + ret i16 %sel +} + +; Should be folded: icmp ne + select with constant in false value +define i16 @icmp_select_fold_ne_i16_imm(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_fold_ne_i16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_i16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i16 %arg, 4242 + %sel = select i1 %cmp, i16 %other, i16 4242 + ret i16 %sel +} + +; Should be folded: icmp ne + select with constant in false value (commutative) +define i16 @icmp_select_fold_ne_imm_i16(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_fold_ne_imm_i16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_imm_i16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i16 4242, %arg + %sel = select i1 %cmp, i16 %other, i16 4242 + ret i16 %sel +} + +; Should NOT be folded: icmp eq with different constants +define i16 @icmp_select_no_fold_i16_different(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_different: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x978 +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_different: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x978, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 %arg, 4242 + %sel = select i1 %cmp, i16 2424, i16 
%other + ret i16 %sel +} + +; Should NOT be folded: icmp eq with constant in other position +define i16 @icmp_select_no_fold_i16_other_pos(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x1092 +; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 %arg, 4242 + %sel = select i1 %cmp, i16 %other, i16 4242 + ret i16 %sel +} + +; Should NOT be folded: unsupported comparison type +define i16 @icmp_select_no_fold_i16_unsupported_cmp(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1093 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x1092 +; GFX900-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x1093, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ugt i16 %arg, 4242 + %sel = select i1 %cmp, i16 4242, i16 %other + ret i16 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i16 @icmp_select_no_fold_i16_enc_imm(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 %arg, 0 + %sel = select i1 %cmp, i16 0, i16 %other + ret i16 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i16 @icmp_select_no_fold_i16_enc_imm_2(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, 45, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 45, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 45, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 45, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 45, %arg + %sel = select i1 %cmp, i16 45, i16 %other + ret i16 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i16 @icmp_select_no_fold_i16_enc_imm_3(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_enc_imm_3: +; 
GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, -12, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, -12, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_enc_imm_3: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, -12, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -12, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i16 %arg, -12 + %sel = select i1 %cmp, i16 %other, i16 -12 + ret i16 %sel +} + +;------------------------------------------------------------------------------ +; I8 Tests +;------------------------------------------------------------------------------ + +; Should be folded: icmp eq + select with constant in true value +define i8 @icmp_select_fold_eq_i8_imm(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_fold_eq_i8_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_cmp_eq_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_i8_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_eq_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 %arg, 123 + %sel = select i1 %cmp, i8 123, i8 %other + ret i8 %sel +} + +; Should be folded: icmp eq + select with constant in true value (commutative) +define i8 @icmp_select_fold_eq_imm_i8(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_fold_eq_imm_i8: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_cmp_eq_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_imm_i8: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_eq_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 123, %arg + %sel = select i1 %cmp, i8 123, i8 %other + ret i8 %sel +} + +; Should be folded: icmp ne + select with constant in false value +define i8 @icmp_select_fold_ne_i8_imm(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_fold_ne_i8_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_i8_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i8 %arg, 123 + %sel = select i1 %cmp, i8 %other, i8 123 + ret i8 %sel +} + +; Should be folded: icmp ne + select 
with constant in false value (commutative) +define i8 @icmp_select_fold_ne_imm_i8(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_fold_ne_imm_i8: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_imm_i8: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i8 123, %arg + %sel = select i1 %cmp, i8 %other, i8 123 + ret i8 %sel +} + +; Should NOT be folded: icmp eq with different constants +define i8 @icmp_select_no_fold_i8_different(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_different: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7c +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_different: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7c, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 %arg, 123 + %sel = select i1 %cmp, i8 124, i8 %other + ret i8 %sel +} + +; Should NOT be folded: icmp eq with constant in other position +define i8 @icmp_select_no_fold_i8_other_pos(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX900-NEXT: v_cmp_eq_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_eq_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7b, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 %arg, 123 + %sel = select i1 %cmp, i8 %other, i8 123 + ret i8 %sel +} + +; Should NOT be folded: unsupported comparison type +define i8 @icmp_select_no_fold_i8_unsupported_cmp(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7c +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX900-NEXT: v_cmp_lt_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7c +; GFX1010-NEXT: v_cmp_lt_u16_sdwa vcc_lo, v0, v2 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7b, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ugt i8 %arg, 123 + %sel = select i1 %cmp, i8 123, i8 %other + ret i8 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i8 @icmp_select_no_fold_i8_enc_imm(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 %arg, 0 + %sel = select i1 %cmp, i8 0, i8 %other + ret i8 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i8 @icmp_select_no_fold_i8_enc_imm_2(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 25 +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, 25, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 25 +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 25, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 25, %arg + %sel = select i1 %cmp, i8 25, i8 %other + ret i8 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i8 @icmp_select_no_fold_i8_enc_imm_3(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_enc_imm_3: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0xfb +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, -5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_enc_imm_3: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0xfb +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -5, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i8 %arg, -5 + %sel = select i1 %cmp, i8 %other, i8 -5 + ret i8 %sel +} diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index ec3781fbf0fc4..f497752994852 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -841,3 +841,23 @@ ret: ret void } +define i64 @poison_should_freeze(i1 %cond1, i32 %val, i16 %val2, i64 %a, i64 %b) { +; GCN-LABEL: poison_should_freeze: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v7, 0x5040100 +; GCN-NEXT: v_perm_b32 v2, v2, s4, v7 +; 
GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] + %poisonv = insertelement <2 x i16> poison, i16 %val2, i32 1 + %poison = bitcast <2 x i16> %poisonv to i32 + %cond2 = select i1 %cond1, i32 %poison, i32 %val + %cmp = icmp eq i32 %cond2, 0 + %select = select i1 %cmp, i64 %a, i64 %b + ret i64 %select +} diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll new file mode 100644 index 0000000000000..192bd2073886a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s + +define amdgpu_kernel void @copy_to_vreg_1(i32 %0) { +; GCN-LABEL: copy_to_vreg_1: +; GCN: ; %bb.0: ; %._crit_edge +; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_mov_b64_e32 v[2:3], 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s5, 1, s4 +; GCN-NEXT: s_cmp_lt_u32 s4, 2 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_and_b64 s[2:3], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s3, s5, 1 +; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-NEXT: s_addc_u32 s0, 1, 0 +; GCN-NEXT: v_readfirstlane_b32 s2, v1 +; GCN-NEXT: s_cmp_ge_u32 s3, s4 +; GCN-NEXT: s_cselect_b32 s4, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_cmp_lg_u64 0, 0 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_branch .LBB0_3 +; GCN-NEXT: .LBB0_1: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_xor_b64 s[8:9], exec, -1 +; GCN-NEXT: .LBB0_2: ; %Flow3 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_and_b64 s[4:5], exec, s[8:9] +; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB0_8 +; GCN-NEXT: .LBB0_3: ; %.lr.ph27 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-NEXT: s_xor_b64 s[6:7], s[8:9], -1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[8:9] +; GCN-NEXT: s_cbranch_execz .LBB0_5 +; GCN-NEXT: ; %bb.4: ; %pred.store.if +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec +; GCN-NEXT: global_store_byte v[2:3], v1, off +; GCN-NEXT: .LBB0_5: ; %Flow2 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], -1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.6: ; %pred.store.continue +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB0_1 +; GCN-NEXT: ; %bb.7: ; %pred.store.if41 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: global_store_byte v[2:3], v1, off +; GCN-NEXT: s_branch .LBB0_1 +; GCN-NEXT: .LBB0_8: ; %DummyReturnBlock +; GCN-NEXT: s_endpgm +._crit_edge: + %id.x = tail call i32 
@llvm.amdgcn.workitem.id.x() + %div = udiv i32 1, %0 + br label %.lr.ph27 + +.lr.ph27: ; preds = %pred.store.if41, %pred.store.continue, %._crit_edge + %iv = phi i32 [ %div, %._crit_edge ], [ 0, %pred.store.if41 ], [ 0, %pred.store.continue ] + %cmp = icmp ugt i32 %iv, 0 + %broadcast.splatinsert37 = insertelement <4 x i1> zeroinitializer, i1 %cmp, i64 0 + %.zext = zext i32 %id.x to i64 + %broadcast.splatinsert39 = insertelement <4 x i64> zeroinitializer, i64 %.zext, i64 0 + %cmp.1 = icmp uge <4 x i64> %broadcast.splatinsert39, splat (i64 1) + %or = or <4 x i1> %cmp.1, %broadcast.splatinsert37 + %extract = extractelement <4 x i1> %or, i64 0 + br i1 %extract, label %pred.store.if, label %pred.store.continue + +pred.store.if: ; preds = %.lr.ph27 + store i8 0, ptr addrspace(1) null, align 64 + br label %pred.store.continue + +pred.store.continue: ; preds = %pred.store.if, %.lr.ph27 + %extract.1 = extractelement <4 x i1> %or, i64 1 + br i1 %extract.1, label %pred.store.if41, label %.lr.ph27 + +pred.store.if41: ; preds = %pred.store.continue + store i8 0, ptr addrspace(1) null, align 64 + br label %.lr.ph27 +} + +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir new file mode 100644 index 0000000000000..2daea2b2eeb74 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir @@ -0,0 +1,31 @@ +# RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: copy_to_vreg_1 +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: copy_to_vreg_1 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[V_CVT_U32_F32_e64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[IMPLICIT_DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[V_CMP_GT_U32_e64:%[0-9]+]]:sreg_64_xexec = samesign V_CMP_GT_U32_e64 [[V_CVT_U32_F32_e64]], killed [[COPY1]], implicit $exec + ; GCN-NEXT: [[VREG1:%[0-9]+]]:vreg_1 = COPY [[V_CMP_GT_U32_e64]] + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec + %1:sreg_32 = COPY %0:vgpr_32 + %2:sreg_32 = COPY $vgpr1 + samesign S_CMP_GT_U32 %1:sreg_32, killed %2:sreg_32, implicit-def $scc + %3:sreg_64 = COPY $scc + %4:vreg_1 = COPY %3:sreg_64 + + bb.1: + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-fma-f64.mir b/llvm/test/CodeGen/AMDGPU/shrink-fma-f64.mir new file mode 100644 index 0000000000000..be46831d3bfe5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-fma-f64.mir @@ -0,0 +1,62 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-shrink-instructions %s -o - | FileCheck %s -check-prefix=GFX1250 + +--- +name: fma_cvv_f64 +body: | + bb.0: + ; GFX1250-LABEL: name: fma_cvv_f64 + ; GFX1250: $vgpr0_vgpr1 = IMPLICIT_DEF + ; GFX1250-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GFX1250-NEXT: $vgpr4_vgpr5 = V_FMAMK_F64 $vgpr0_vgpr1, 4638355772470722560, $vgpr2_vgpr3, implicit $mode, implicit $exec + ; GFX1250-NEXT: SI_RETURN implicit $vgpr4_vgpr5 + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_FMA_F64_e64 0, 4638355772470722560, 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr4_vgpr5 +... + +--- +name: fma_vcv_f64 +body: | + bb.0: + ; GFX1250-LABEL: name: fma_vcv_f64 + ; GFX1250: $vgpr0_vgpr1 = IMPLICIT_DEF + ; GFX1250-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GFX1250-NEXT: $vgpr4_vgpr5 = V_FMAMK_F64 $vgpr0_vgpr1, 4638355772470722560, $vgpr2_vgpr3, implicit $mode, implicit $exec + ; GFX1250-NEXT: SI_RETURN implicit $vgpr4_vgpr5 + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr0_vgpr1, 0, 4638355772470722560, 0, $vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr4_vgpr5 +... + +--- +name: fma_vvc_f64 +body: | + bb.0: + ; GFX1250-LABEL: name: fma_vvc_f64 + ; GFX1250: $vgpr0_vgpr1 = IMPLICIT_DEF + ; GFX1250-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GFX1250-NEXT: $vgpr4_vgpr5 = V_FMAAK_F64 $vgpr0_vgpr1, $vgpr2_vgpr3, 4638355772470722560, implicit $mode, implicit $exec + ; GFX1250-NEXT: SI_RETURN implicit $vgpr4_vgpr5 + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 4638355772470722560, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr4_vgpr5 +... + +--- +name: fma_vsc_f64 +body: | + bb.0: + ; GFX1250-LABEL: name: fma_vsc_f64 + ; GFX1250: $vgpr0_vgpr1 = IMPLICIT_DEF + ; GFX1250-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GFX1250-NEXT: $vgpr4_vgpr5 = V_FMAAK_F64 $vgpr0_vgpr1, $vgpr2_vgpr3, 4638355772470722560, implicit $mode, implicit $exec + ; GFX1250-NEXT: SI_RETURN implicit $vgpr4_vgpr5 + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, 4638355772470722560, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr4_vgpr5 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir index 8e6da4bf92ee0..3f6956b83ae92 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir @@ -18,9 +18,9 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX908-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX908-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -29,8 +29,8 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GFX908-SPILLED-NEXT: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_AV32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr1 = SI_SPILL_AV32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32 @@ -62,9 +62,9 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -73,8 +73,8 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GFX90A-SPILLED-NEXT: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_AV32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr1 = SI_SPILL_AV32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX90A-SPILLED-NEXT: 
S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32 @@ -124,7 +124,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1 - ; GFX908-SPILLED-NEXT: SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -133,7 +133,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr64 @@ -164,7 +164,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -173,7 +173,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr64 @@ -222,14 +222,14 @@ body: | ; GFX908-SPILLED-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX908-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: ; GFX908-SPILLED-NEXT: successors: %bb.2(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_AV32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 ; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -288,14 +288,14 @@ body: | ; GFX90A-SPILLED-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: ; GFX90A-SPILLED-NEXT: successors: %bb.2(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_AV32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -385,7 +385,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2 - ; GFX908-SPILLED-NEXT: SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -394,7 +394,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_AV96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, 
addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr96 @@ -427,7 +427,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -436,7 +436,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_AV96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr96 @@ -486,7 +486,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3 - ; GFX908-SPILLED-NEXT: SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -495,7 +495,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr128 @@ -530,7 +530,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -539,7 +539,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: 
$agpr0_agpr1_agpr2_agpr3 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr128 @@ -591,7 +591,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4 - ; GFX908-SPILLED-NEXT: SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -600,7 +600,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_AV160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr160 @@ -637,7 +637,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -646,7 +646,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_AV160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr160 @@ -700,7 +700,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GFX908-SPILLED-NEXT: SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s192) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s192) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -709,7 +709,7 @@ body: | ; 
GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr192 @@ -748,7 +748,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s192) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s192) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -757,7 +757,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr192 @@ -813,7 +813,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-SPILLED-NEXT: SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -822,7 +822,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_AV256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr256 @@ -865,7 +865,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A256_SAVE killed 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -874,7 +874,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_AV256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr256 @@ -934,7 +934,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 - ; GFX908-SPILLED-NEXT: SI_SPILL_A288_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s288) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV288_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s288) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -943,7 +943,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_A288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_AV288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr288 @@ -988,7 +988,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A288_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s288) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV288_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s288) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -997,7 +997,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_A288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from 
%stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_AV288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr288 @@ -1059,7 +1059,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 - ; GFX908-SPILLED-NEXT: SI_SPILL_A320_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, %stack.0, $sgpr32, 0, implicit $exec :: (store (s320) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV320_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, %stack.0, $sgpr32, 0, implicit $exec :: (store (s320) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1068,7 +1068,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_A320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_AV320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr320 @@ -1115,7 +1115,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A320_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, %stack.0, $sgpr32, 0, implicit $exec :: (store (s320) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV320_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, %stack.0, $sgpr32, 0, implicit $exec :: (store (s320) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1124,7 +1124,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_A320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_AV320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr320 @@ -1188,7 +1188,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 - ; 
GFX908-SPILLED-NEXT: SI_SPILL_A352_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s352) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV352_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s352) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1197,7 +1197,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_A352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_AV352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr352 @@ -1246,7 +1246,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A352_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s352) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV352_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s352) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1255,7 +1255,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_A352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_AV352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr352 @@ -1321,7 +1321,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 - ; GFX908-SPILLED-NEXT: SI_SPILL_A384_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV384_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; 
GFX908-SPILLED-NEXT: bb.1: @@ -1330,7 +1330,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_A384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_AV384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr384 @@ -1381,7 +1381,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A384_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV384_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1390,7 +1390,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_A384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_AV384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr384 @@ -1458,7 +1458,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-SPILLED-NEXT: SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1467,7 +1467,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) 
+ ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr512 @@ -1526,7 +1526,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1535,7 +1535,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr512 @@ -1611,7 +1611,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GFX908-SPILLED-NEXT: SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1620,7 +1620,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_AV1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr1024 @@ -1711,7 +1711,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1720,7 +1720,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_AV1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr1024 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index 
bd255e88b9512..648b59f69ea79 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -9,9 +9,9 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; GCN-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %14.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 6da7d1b7ee868..a6b8ea3963b38 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1 ; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2 ; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0 +; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 @@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 ; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 47dfa9f4fc2d3..33c2ce628e108 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -921,45 +921,47 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], 31 -; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], 31 -; GCN-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-NEXT: s_add_u32 s6, s6, s4 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: s_addc_u32 s7, s7, s4 -; GCN-NEXT: s_xor_b64 s[8:9], s[6:7], s[4:5] +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 31 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 +; GCN-NEXT: s_ashr_i32 s6, s5, 31 +; GCN-NEXT: s_add_u32 s4, s4, s6 +; GCN-NEXT: s_mov_b32 s7, 
s6 +; GCN-NEXT: s_addc_u32 s5, s5, s6 +; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s2, 0, s8 -; GCN-NEXT: s_subb_u32 s4, 0, s9 -; GCN-NEXT: s_ashr_i32 s12, s3, 31 +; GCN-NEXT: s_sub_u32 s4, 0, s8 +; GCN-NEXT: s_subb_u32 s5, 0, s9 +; GCN-NEXT: s_ashr_i32 s10, s3, 31 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s13, s12 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_add_u32 s2, s2, s10 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: s_addc_u32 s3, s3, s10 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc @@ -967,12 +969,12 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -988,20 +990,18 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: s_add_u32 s2, s10, s12 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_addc_u32 s3, s11, s12 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s12, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v0 +; GCN-NEXT: 
v_mul_hi_u32 v4, s12, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s13, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s13, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s13, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1013,9 +1013,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] @@ -1030,7 +1030,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s13 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc @@ -1042,10 +1042,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s12, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir index 831570800d06c..6966c3d8b6d6a 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir +++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir @@ -34,26 +34,26 @@ body: | ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x00000000000000FF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_V128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: 
(store (s128) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_AV32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; CHECK-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr5 - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr8 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr8_vgpr9 ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr4, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable 
$vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5) ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir b/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir new file mode 100644 index 0000000000000..fa27d689dd8dd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX1250 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX1200 %s + +--- +name: trans_writes_valu_reads_hazard +body: | + bb.0: + ; GFX1250-LABEL: name: trans_writes_valu_reads_hazard + ; GFX1250: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; + ; GFX1200-LABEL: name: trans_writes_valu_reads_hazard + ; GFX1200: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1200-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec +... + +--- +name: trans_writes_valu_valu_reads_hazard_covered +body: | + bb.0: + ; GCN-LABEL: name: trans_writes_valu_valu_reads_hazard_covered + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec +... + +--- +name: trans_writes_salu_valu_reads_hazard +body: | + bb.0: + ; GFX1250-LABEL: name: trans_writes_salu_valu_reads_hazard + ; GFX1250: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1250-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; + ; GFX1200-LABEL: name: trans_writes_salu_valu_reads_hazard + ; GFX1200: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1200-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + ; GFX1200-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec +... 
+
+---
+name: trans_no_hazard
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_no_hazard
+    ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_reads_valu_writes_hazard
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: trans_reads_valu_writes_hazard
+    ; GFX1250: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    ;
+    ; GFX1200-LABEL: name: trans_reads_valu_writes_hazard
+    ; GFX1200: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1200-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+...
+
+---
+name: trans_reads_valu_valu_writes_hazard_covered
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_reads_valu_valu_writes_hazard_covered
+    ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec
+    $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_reads_salu_valu_writes_hazard
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: trans_reads_salu_valu_writes_hazard
+    ; GFX1250: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1250-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+    ;
+    ; GFX1200-LABEL: name: trans_reads_salu_valu_writes_hazard
+    ; GFX1200: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GFX1200-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    ; GFX1200-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc
+    $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec
+...
+
+---
+name: trans_writes_trans_reads_no_hazard
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_writes_trans_reads_no_hazard
+    ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr2 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr2 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec
+...
+
+---
+name: trans_reads_trans_writes_no_hazard
+body: |
+  bb.0:
+    ; GCN-LABEL: name: trans_reads_trans_writes_no_hazard
+    ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr0 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
+    $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec
+    $vgpr0 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir index b0a75a526cf2b..316c34ebabcd2 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir @@ -1,8 +1,10 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefixes=GCN,GFX90A %s # RUN: llc -mtriple=amdgcn -mcpu=gfx90a %s --passes=two-address-instruction -verify-each -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 %s -run-pass twoaddressinstruction -o - | FileCheck -check-prefixes=GCN,GFX1250 %s # GCN-LABEL: name: test_fmamk_reg_imm_f64 -# GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec +# GFX90A: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec +# GFX1250: V_FMAMK_F64 killed %0, 4607182418800017408, killed %1, implicit $mode, implicit $exec --- name: test_fmamk_reg_imm_f64 registers: @@ -21,7 +23,8 @@ body: | ... # GCN-LABEL: name: test_fmamk_imm_reg_f64 -# GCN: V_FMA_F64_e64 0, %2, 0, killed %0.sub0_sub1, 0, killed %1, 0, 0, implicit $mode, implicit $exec +# GFX90A: V_FMA_F64_e64 0, %2, 0, killed %0.sub0_sub1, 0, killed %1, 0, 0, implicit $mode, implicit $exec +# GFX1250: V_FMAMK_F64 killed %0.sub0_sub1, 4607182418800017408, killed %1, implicit $mode, implicit $exec --- name: test_fmamk_imm_reg_f64 registers: @@ -40,7 +43,8 @@ body: | ... # GCN-LABEL: name: test_fmaak_f64 -# GCN: V_FMA_F64_e64 0, killed %0.sub0_sub1, 0, %0.sub2_sub3, 0, %1, 0, 0, implicit $mode, implicit $exec +# GFX90A: V_FMA_F64_e64 0, killed %0.sub0_sub1, 0, %0.sub2_sub3, 0, %1, 0, 0, implicit $mode, implicit $exec +# GFX1250: V_FMAAK_F64 killed %0.sub0_sub1, %0.sub2_sub3, 4607182418800017408, implicit $mode, implicit $exec --- name: test_fmaak_f64 registers: @@ -57,7 +61,8 @@ body: | ... # GCN-LABEL: name: test_fmaak_sgpr_src0_f64 -# GCN: V_FMA_F64_e64 0, killed %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec +# GFX90A: V_FMA_F64_e64 0, killed %0, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec +# GFX1250: V_FMAMK_F64 killed %0, 4607182418800017408, %2, implicit $mode, implicit $exec --- name: test_fmaak_sgpr_src0_f64 @@ -77,7 +82,8 @@ body: | ... # GCN-LABEL: name: test_fmaak_inlineimm_src0_f64 -# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec +# GFX90A: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec +# GFX1250: V_FMAMK_F64 4611686018427387904, 4607182418800017408, %1, implicit $mode, implicit $exec --- name: test_fmaak_inlineimm_src0_f64 @@ -95,7 +101,8 @@ body: | ... # GCN-LABEL: name: test_fmaak_otherimm_src0_f64 -# GCN: V_FMAC_F64_e32 4636737291354636288, %0, %2, implicit $mode, implicit $exec +# GFX90A: V_FMAC_F64_e32 4636737291354636288, %0, %2, implicit $mode, implicit $exec +# GFX1250: V_FMAMK_F64 %0, 4636737291354636288, %1, implicit $mode, implicit $exec --- name: test_fmaak_otherimm_src0_f64 @@ -134,7 +141,8 @@ body: | ... 
# GCN-LABEL: name: test_fmamk_reg_unfoldable_literal_src0_f64 -# GCN: V_FMA_F64_e64 0, %2, 0, killed %0, 0, killed %1, 0, 0, implicit $mode, implicit $exec +# GFX90A: V_FMA_F64_e64 0, %2, 0, killed %0, 0, killed %1, 0, 0, implicit $mode, implicit $exec +# GFX1250: V_FMAMK_F64 killed %0, 123456, killed %1, implicit $mode, implicit $exec --- name: test_fmamk_reg_unfoldable_literal_src0_f64 registers: @@ -153,7 +161,8 @@ body: | ... # GCN-LABEL: name: test_fmamk_reg_unfoldable_literal_src1_f64 -# GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec +# GFX90A: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec +# GFX1250: V_FMAMK_F64 killed %0, 123456, killed %1, implicit $mode, implicit $exec --- name: test_fmamk_reg_unfoldable_literal_src1_f64 registers: @@ -172,7 +181,8 @@ body: | ... # GCN-LABEL: name: test_fmaak_reg_unfoldable_literal_src2_f64 -# GCN: V_FMA_F64_e64 0, killed %0, 0, killed %1, 0, %2, 0, 0, implicit $mode, implicit $exec +# GFX90A: V_FMA_F64_e64 0, killed %0, 0, killed %1, 0, %2, 0, 0, implicit $mode, implicit $exec +# GFX1250: V_FMAAK_F64 killed %0, killed %1, 123456, implicit $mode, implicit $exec --- name: test_fmaak_reg_unfoldable_literal_src2_f64 registers: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index f0829b53168d9..c12265bd7f372 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3924,37 +3924,37 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: 
v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4028,37 +4028,37 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, 
v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4132,47 +4132,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc 
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4242,49 +4244,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 
+; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, 
v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4346,50 +4348,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; 
GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4453,58 +4454,58 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; 
GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index e67420562e257..5056747c33cc2 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3924,37 +3924,37 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: 
v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4028,37 +4028,37 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 
vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4132,47 +4132,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: 
v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, 
vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4242,49 +4244,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; 
GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4346,50 +4348,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; 
GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4453,58 +4454,58 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; 
GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, 
v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 92993d07b4f8f..184c80765430c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3805,37 +3805,37 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc 
+; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3909,37 +3909,37 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: 
v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4013,47 +4013,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: 
v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4123,49 +4125,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: 
v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4227,50 +4229,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[12:13], v[28:29] +; 
GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: 
v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4334,58 +4335,58 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 
v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 2bcee373d9247..e3a7ae5fd0256 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3544,37 +3544,37 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], 
v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3648,37 +3648,37 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; 
GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3752,47 +3752,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], 
v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -3862,49 +3864,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; 
%bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -3966,50 +3968,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4073,58 +4074,58 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, 
v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/ARM/fast-isel-align.ll b/llvm/test/CodeGen/ARM/fast-isel-align.ll index e7741155e6bf1..8adbc3039d4e1 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-align.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-align.ll @@ -8,9 +8,6 @@ ; RUN: llc < %s -O0 -mattr=+strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN ; RUN: llc < %s -O0 -mattr=+strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN -; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl -verify-machineinstrs | FileCheck %s --check-prefix=ARM -; RUN: llc < %s -O0 -mattr=+strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN - ; RUN: llc < %s -O0 -mattr=+strict-align -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN ; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown -mattr=+strict-align -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN ; RUN: llc < %s -O0 -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=ARM diff --git a/llvm/test/CodeGen/ARM/llvm.frexp.ll b/llvm/test/CodeGen/ARM/llvm.frexp.ll index 43edb17fe1081..376426d701b3e 100644 --- a/llvm/test/CodeGen/ARM/llvm.frexp.ll +++ b/llvm/test/CodeGen/ARM/llvm.frexp.ll @@ -544,6 +544,59 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) { ret <2 x i32> %result.1 } +define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind { +; CHECK-LABEL: test_frexp_f128_i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: mov r12, r3 +; CHECK-NEXT: ldr r3, [sp, #16] +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: add r0, sp, #4 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: bl frexpl +; CHECK-NEXT: ldr.w r12, [sp, #4] +; CHECK-NEXT: stm.w r4, {r0, r1, r2, r3, r12} +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: pop {r4, pc} + %result = call { fp128, i32 } @llvm.frexp.f128.i32(fp128 %a) + ret { fp128, i32 } %result +} + +define fp128 @test_frexp_f128_i32_only_use_fract(fp128 %a) nounwind { +; CHECK-LABEL: test_frexp_f128_i32_only_use_fract: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: add.w r12, sp, #4 +; CHECK-NEXT: str.w r12, [sp] +; CHECK-NEXT: bl frexpl +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: pop {r7, pc} + %result = call { fp128, i32 } @llvm.frexp.f128.i32(fp128 %a) + %result.0 = extractvalue { fp128, i32 } %result, 0 + ret fp128 %result.0 +} + +define i32 @test_frexp_f128_i32_only_use_exp(fp128 %a) nounwind { +; CHECK-LABEL: test_frexp_f128_i32_only_use_exp: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: add.w r12, sp, #4 +; CHECK-NEXT: str.w r12, [sp] +; CHECK-NEXT: bl frexpl +; CHECK-NEXT: ldr 
r0, [sp, #4] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: pop {r7, pc} + %result = call { fp128, i32 } @llvm.frexp.f128.i32(fp128 %a) + %result.0 = extractvalue { fp128, i32 } %result, 1 + ret i32 %result.0 +} + declare { float, i32 } @llvm.frexp.f32.i32(float) #0 declare { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float>) #0 declare { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float>) #0 diff --git a/llvm/test/CodeGen/ARM/stack-protector-target.ll b/llvm/test/CodeGen/ARM/stack-protector-target.ll new file mode 100644 index 0000000000000..a7ec0ec69bd2c --- /dev/null +++ b/llvm/test/CodeGen/ARM/stack-protector-target.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=arm-unknown-linux-eabi < %s | FileCheck -check-prefix=LINUX %s +; RUN: llc -mtriple=arm-unknown-linux-gnueabi < %s | FileCheck -check-prefix=LINUX %s +; RUN: llc -mtriple=arm-unknown-openbsd < %s | FileCheck -check-prefix=OPENBSD %s + +define void @func() sspreq nounwind { +; LINUX-LABEL: func: +; LINUX: @ %bb.0: +; LINUX-NEXT: .save {r11, lr} +; LINUX-NEXT: push {r11, lr} +; LINUX-NEXT: .pad #8 +; LINUX-NEXT: sub sp, sp, #8 +; LINUX-NEXT: ldr r0, .LCPI0_0 +; LINUX-NEXT: ldr r0, [r0] +; LINUX-NEXT: str r0, [sp, #4] +; LINUX-NEXT: mov r0, sp +; LINUX-NEXT: bl capture +; LINUX-NEXT: ldr r0, [sp, #4] +; LINUX-NEXT: ldr r1, .LCPI0_0 +; LINUX-NEXT: ldr r1, [r1] +; LINUX-NEXT: cmp r1, r0 +; LINUX-NEXT: addeq sp, sp, #8 +; LINUX-NEXT: popeq {r11, lr} +; LINUX-NEXT: moveq pc, lr +; LINUX-NEXT: .LBB0_1: +; LINUX-NEXT: bl __stack_chk_fail +; LINUX-NEXT: .p2align 2 +; LINUX-NEXT: @ %bb.2: +; LINUX-NEXT: .LCPI0_0: +; LINUX-NEXT: .long __stack_chk_guard +; +; OPENBSD-LABEL: func: +; OPENBSD: @ %bb.0: +; OPENBSD-NEXT: push {r4, lr} +; OPENBSD-NEXT: sub sp, sp, #8 +; OPENBSD-NEXT: ldr r4, .LCPI0_0 +; OPENBSD-NEXT: ldr r0, [r4] +; OPENBSD-NEXT: ldr r0, .LCPI0_2 +; OPENBSD-NEXT: ldr r0, [r0] +; OPENBSD-NEXT: str r0, [sp, #4] +; OPENBSD-NEXT: mov r0, sp +; OPENBSD-NEXT: bl capture +; OPENBSD-NEXT: ldr r0, [r4] +; OPENBSD-NEXT: ldr r1, [sp, #4] +; OPENBSD-NEXT: cmp r0, r1 +; OPENBSD-NEXT: addeq sp, sp, #8 +; OPENBSD-NEXT: popeq {r4, lr} +; OPENBSD-NEXT: moveq pc, lr +; OPENBSD-NEXT: .LBB0_1: @ %CallStackCheckFailBlk +; OPENBSD-NEXT: ldr r0, .LCPI0_1 +; OPENBSD-NEXT: bl __stack_smash_handler +; OPENBSD-NEXT: .p2align 2 +; OPENBSD-NEXT: @ %bb.2: +; OPENBSD-NEXT: .LCPI0_0: +; OPENBSD-NEXT: .long __guard_local +; OPENBSD-NEXT: .LCPI0_1: +; OPENBSD-NEXT: .long .LSSH +; OPENBSD-NEXT: .LCPI0_2: +; OPENBSD-NEXT: .long __guard_local + %alloca = alloca i32, align 4 + call void @capture(ptr %alloca) + ret void +} + +declare void @capture(ptr) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; EABI: {{.*}} +; GNUEABI: {{.*}} diff --git a/llvm/test/CodeGen/ARM/struct_byval.ll b/llvm/test/CodeGen/ARM/struct_byval.ll index 2bc4f9c816d53..f8059d08881ab 100644 --- a/llvm/test/CodeGen/ARM/struct_byval.ll +++ b/llvm/test/CodeGen/ARM/struct_byval.ll @@ -1,6 +1,5 @@ ; RUN: llc < %s -mtriple=armv7-apple-ios6.0 | FileCheck %s ; RUN: llc < %s -mtriple=thumbv7-apple-ios6.0 | FileCheck %s -; RUN: llc < %s -mtriple=armv7-unknown-nacl-gnueabi | FileCheck %s -check-prefix=NACL ; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi | FileCheck %s -check-prefix=NOMOVT ; NOMOVT-NOT: movt @@ -28,14 +27,6 @@ entry: ; CHECK: sub ; CHECK: str ; CHECK: bne -; NACL-LABEL: g: -; Ensure that use movw instead of constpool for the loop trip count. But don't -; match the __stack_chk_guard movw -; NACL: movw {{r[0-9]+|lr}}, # -; NACL: ldr -; NACL: sub -; NACL: str -; NACL: bne %st = alloca %struct.LargeStruct, align 4 %call = call i32 @e2(ptr byval(%struct.LargeStruct) %st) ret i32 0 @@ -49,11 +40,6 @@ entry: ; CHECK: sub ; CHECK: vst1 ; CHECK: bne -; NACL: movw {{r[0-9]+|lr}}, # -; NACL: vld1 -; NACL: sub -; NACL: vst1 -; NACL: bne %st = alloca %struct.LargeStruct, align 16 %call = call i32 @e3(ptr byval(%struct.LargeStruct) align 16 %st) ret i32 0 diff --git a/llvm/test/CodeGen/ARM/trap.ll b/llvm/test/CodeGen/ARM/trap.ll index a5d2b2081f28f..88b69a4ed7b89 100644 --- a/llvm/test/CodeGen/ARM/trap.ll +++ b/llvm/test/CodeGen/ARM/trap.ll @@ -4,17 +4,9 @@ ; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN ; RUN: llc < %s -mtriple=arm-apple-darwin -trap-func=_trap | FileCheck %s -check-prefix=FUNC ; RUN: llc < %s -mtriple=arm-apple-darwin -trap-func=_trap -O0 | FileCheck %s -check-prefix=FUNC -; RUN: llc < %s -mtriple=armv7 -mattr=+nacl-trap | FileCheck %s -check-prefix=NACL ; RUN: llc < %s -mtriple=armv7 | FileCheck %s -check-prefix=ARM ; RUN: llc < %s -mtriple=thumbv7 | FileCheck %s -check-prefix=THUMB -; RUN: llc -mtriple=armv7 -mattr=+nacl-trap -filetype=obj %s -o - \ -; RUN: | llvm-objdump -d --triple=armv7 --mattr=+nacl-trap - \ -; RUN: | FileCheck %s -check-prefix=ENCODING-NACL -; RUN: llc -verify-machineinstrs -fast-isel -mtriple=armv7 -mattr=+nacl-trap -filetype=obj %s -o - \ -; RUN: | llvm-objdump -d --triple=armv7 --mattr=+nacl-trap - \ -; RUN: | FileCheck %s -check-prefix=ENCODING-NACL - ; RUN: llc -mtriple=armv7 -filetype=obj %s -o - \ ; RUN: | llvm-objdump -d --triple=armv7 - \ ; RUN: | FileCheck %s -check-prefix=ENCODING-ARM @@ -46,10 +38,6 @@ entry: ; FUNC: bl __trap ; FUNC-NEXT: add sp, sp, #4 -; NACL-LABEL: t: -; NACL: .inst 0xe7fedef0 -; NACL-NEXT: add sp, sp, #4 - ; ARM-LABEL: t: ; ARM: .inst 0xe7ffdefe ; ARM-NEXT: add sp, sp, #4 @@ -58,8 +46,6 @@ entry: ; THUMB: .inst.n 0xdefe ; THUMB-NEXT: add sp, #4 -; ENCODING-NACL: e7fedef0 trap - ; ENCODING-ARM: e7ffdefe trap ; ENCODING-THUMB: defe trap @@ -82,10 +68,6 @@ entry: ; FUNC: bl __trap ; FUNC-NEXT: add sp, sp, #4 -; NACL-LABEL: t2: -; NACL: bkpt #0 -; NACL-NEXT: add sp, sp, #4 - ; ARM-LABEL: t2: ; ARM: bkpt #0 ; ARM-NEXT: add sp, sp, #4 @@ -94,8 +76,6 @@ entry: ; THUMB: bkpt #0 ; THUMB-NEXT: add sp, #4 -; ENCODING-NACL: e1200070 bkpt #0 - ; ENCODING-ARM: e1200070 bkpt #0 ; ENCODING-THUMB: be00 bkpt #0 diff --git a/llvm/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll b/llvm/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll deleted file mode 100644 index 1dda1fed366b0..0000000000000 --- a/llvm/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc 
< %s -mtriple=arm-nacl-gnueabi | FileCheck %s - -declare void @llvm.va_start(ptr) -declare void @external_func(ptr) - -@va_list = external global ptr - -; On ARM, varargs arguments are passed in r0-r3 with the rest on the -; stack. A varargs function must therefore spill rN-r3 just below the -; function's initial stack pointer. -; -; This test checks for a bug in which a gap was left between the spill -; area and varargs arguments on the stack when using 16 byte stack -; alignment. - -define void @varargs_func(i32 %arg1, ...) { - call void @llvm.va_start(ptr @va_list) - call void @external_func(ptr @va_list) - ret void -} -; CHECK-LABEL: varargs_func: -; Reserve space for the varargs save area. This currently reserves -; more than enough (16 bytes rather than the 12 bytes needed). -; CHECK: sub sp, sp, #12 -; CHECK: push {r11, lr} -; Align the stack pointer to a multiple of 16. -; CHECK: sub sp, sp, #12 -; Calculate the address of the varargs save area and save varargs -; arguments into it. -; CHECK-NEXT: add r0, sp, #20 -; CHECK-NEXT: stm r0, {r1, r2, r3} diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll index dc8c5f8421bfe..a2e105537ab88 100644 --- a/llvm/test/CodeGen/DirectX/flatten-array.ll +++ b/llvm/test/CodeGen/DirectX/flatten-array.ll @@ -123,8 +123,8 @@ define void @gep_4d_test () { @b = internal global [2 x [3 x [4 x i32]]] zeroinitializer, align 16 define void @global_gep_load() { - ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 6 - ; CHECK-NEXT: load i32, ptr [[GEP_PTR]], align 4 + ; CHECK-LABEL: define void @global_gep_load( + ; CHECK: {{.*}} = load i32, ptr getelementptr inbounds ([24 x i32], ptr @a.1dim, i32 0, i32 6), align 4 ; CHECK-NEXT: ret void %1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @a, i32 0, i32 0 %2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 1 @@ -133,6 +133,14 @@ define void @global_gep_load() { ret void } +define void @global_nested_geps() { + ; CHECK-LABEL: define void @global_nested_geps( + ; CHECK: {{.*}} = load i32, ptr getelementptr inbounds ([24 x i32], ptr @a.1dim, i32 0, i32 6), align 4 + ; CHECK-NEXT: ret void + %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* getelementptr inbounds ([3 x [4 x i32]], [3 x [4 x i32]]* getelementptr inbounds ([2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @a, i32 0, i32 0), i32 0, i32 1), i32 0, i32 2), align 4 + ret void +} + define void @global_gep_load_index(i32 %row, i32 %col, i32 %timeIndex) { ; CHECK-LABEL: define void @global_gep_load_index( ; CHECK-SAME: i32 [[ROW:%.*]], i32 [[COL:%.*]], i32 [[TIMEINDEX:%.*]]) { @@ -159,9 +167,9 @@ define void @global_gep_load_index(i32 %row, i32 %col, i32 %timeIndex) { define void @global_incomplete_gep_chain(i32 %row, i32 %col) { ; CHECK-LABEL: define void @global_incomplete_gep_chain( ; CHECK-SAME: i32 [[ROW:%.*]], i32 [[COL:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[COL]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[COL]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 0, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[ROW]], 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[ROW]], 12 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP4]] ; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}} @@ -177,8 +185,7 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) { } define void @global_gep_store() { - ; 
CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @b.1dim, i32 0, i32 13 - ; CHECK-NEXT: store i32 1, ptr [[GEP_PTR]], align 4 + ; CHECK: store i32 1, ptr getelementptr inbounds ([24 x i32], ptr @b.1dim, i32 0, i32 13), align 4 ; CHECK-NEXT: ret void %1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @b, i32 0, i32 1 %2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 0 @@ -204,14 +211,35 @@ define void @two_index_gep() { define void @two_index_gep_const() { ; CHECK-LABEL: define void @two_index_gep_const( - ; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 3 - ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP_PTR]], align 4 + ; CHECK-NEXT: load float, ptr addrspace(3) getelementptr inbounds nuw ([4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 3), align 4 ; CHECK-NEXT: ret void %1 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 1, i32 1 %3 = load float, ptr addrspace(3) %1, align 4 ret void } +define void @zero_index_global() { + ; CHECK-LABEL: define void @zero_index_global( + ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 0 + ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP]], align 4 + ; CHECK-NEXT: ret void + %1 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 0, i32 0 + %2 = load float, ptr addrspace(3) %1, align 4 + ret void +} + +; Note: A ConstantExpr GEP with all 0 indices is equivalent to the pointer +; operand of the GEP. Therefore the visitLoadInst will not see the pointer operand +; as a ConstantExpr GEP and will not create a GEP instruction to be visited. +; The later dxil-legalize pass will insert a GEP in this instance. +define void @zero_index_global_const() { + ; CHECK-LABEL: define void @zero_index_global_const( + ; CHECK-NEXT: load float, ptr addrspace(3) @g.1dim, align 4 + ; CHECK-NEXT: ret void + %1 = load float, ptr addrspace(3) getelementptr inbounds nuw ([2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 0, i32 0), align 4 + ret void +} + define void @gep_4d_index_test() { ; CHECK-LABEL: gep_4d_index_test ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4 @@ -257,5 +285,47 @@ define void @gep_4d_index_and_gep_chain_mixed() { ret void } +; This test demonstrates that the collapsing of GEP chains occurs regardless of +; the source element type given to the GEP. As long as the root pointer being +; indexed to is an aggregate data structure, the GEP will be flattened. 
+define void @gep_scalar_flatten() { + ; CHECK-LABEL: gep_scalar_flatten + ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32] + ; CHECK-NEXT: getelementptr inbounds nuw [24 x i32], ptr [[ALLOCA]], i32 0, i32 17 + ; CHECK-NEXT: getelementptr inbounds nuw [24 x i32], ptr [[ALLOCA]], i32 0, i32 17 + ; CHECK-NEXT: getelementptr inbounds nuw [24 x i32], ptr [[ALLOCA]], i32 0, i32 23 + ; CHECK-NEXT: ret void + %a = alloca [2 x [3 x [4 x i32]]], align 4 + %i8root = getelementptr inbounds nuw i8, [2 x [3 x [4 x i32]]]* %a, i32 68 ; %a[1][1][1] + %i32root = getelementptr inbounds nuw i32, [2 x [3 x [4 x i32]]]* %a, i32 17 ; %a[1][1][1] + %c0 = getelementptr inbounds nuw [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* %a, i32 0, i32 1 ; %a[1] + %c1 = getelementptr inbounds nuw i32, [3 x [4 x i32]]* %c0, i32 8 ; %a[1][2] + %c2 = getelementptr inbounds nuw i8, [4 x i32]* %c1, i32 12 ; %a[1][2][3] + ret void +} + +define void @gep_scalar_flatten_dynamic(i32 %index) { + ; CHECK-LABEL: gep_scalar_flatten_dynamic + ; CHECK-SAME: i32 [[INDEX:%.*]]) { + ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [6 x i32], align 4 + ; CHECK-NEXT: [[I8INDEX:%.*]] = mul i32 [[INDEX]], 12 + ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[I8INDEX]], 1 + ; CHECK-NEXT: [[DIV:%.*]] = lshr i32 [[MUL]], 2 + ; CHECK-NEXT: [[ADD:%.*]] = add i32 0, [[DIV]] + ; CHECK-NEXT: getelementptr inbounds nuw [6 x i32], ptr [[ALLOCA]], i32 0, i32 [[ADD]] + ; CHECK-NEXT: [[I32INDEX:%.*]] = mul i32 [[INDEX]], 3 + ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[I32INDEX]], 1 + ; CHECK-NEXT: [[ADD:%.*]] = add i32 0, [[MUL]] + ; CHECK-NEXT: getelementptr inbounds nuw [6 x i32], ptr [[ALLOCA]], i32 0, i32 [[ADD]] + ; CHECK-NEXT: ret void + ; + %a = alloca [2 x [3 x i32]], align 4 + %i8index = mul i32 %index, 12 + %i8root = getelementptr inbounds nuw i8, [2 x [3 x i32]]* %a, i32 %i8index; + %i32index = mul i32 %index, 3 + %i32root = getelementptr inbounds nuw i32, [2 x [3 x i32]]* %a, i32 %i32index; + ret void +} + ; Make sure we don't try to walk the body of a function declaration. 
declare void @opaque_function() diff --git a/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll b/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll index c73e5017348d1..78971b8954150 100644 --- a/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll +++ b/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll @@ -8,16 +8,14 @@ define internal void @main() { ; CHECK-LABEL: define internal void @main() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 1 -; CHECK-NEXT: [[DOTI0:%.*]] = load float, ptr [[TMP0]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 2 -; CHECK-NEXT: [[DOTI03:%.*]] = load float, ptr [[TMP1]], align 16 +; CHECK-NEXT: [[DOTI0:%.*]] = load float, ptr getelementptr ([6 x float], ptr @ZerroInitArr.1dim, i32 0, i32 3), align 16 +; CHECK-NEXT: [[DOTI03:%.*]] = load float, ptr getelementptr ([6 x float], ptr @ZerroInitArr.1dim, i32 0, i32 6), align 16 ; CHECK-NEXT: ret void ; entry: - %0 = getelementptr [8 x [3 x float]], ptr @ZerroInitArr, i32 0, i32 1 + %0 = getelementptr [2 x [3 x float]], ptr @ZerroInitArr, i32 0, i32 1 %.i0 = load float, ptr %0, align 16 - %1 = getelementptr [8 x [3 x float]], ptr @ZerroInitArr, i32 0, i32 2 + %1 = getelementptr [2 x [3 x float]], ptr @ZerroInitArr, i32 0, i32 2 %.i03 = load float, ptr %1, align 16 ret void } diff --git a/llvm/test/CodeGen/DirectX/legalize-i8.ll b/llvm/test/CodeGen/DirectX/legalize-i8.ll index f8aa2c5ecd932..7eb47ba661f4c 100644 --- a/llvm/test/CodeGen/DirectX/legalize-i8.ll +++ b/llvm/test/CodeGen/DirectX/legalize-i8.ll @@ -127,10 +127,9 @@ define i32 @i8_geps_index0() { ; CHECK: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: ret i32 [[LOAD]] %1 = alloca [2 x i32], align 8 - %2 = getelementptr inbounds nuw i8, ptr %1, i32 0 - %3 = load i8, ptr %2 - %4 = sext i8 %3 to i32 - ret i32 %4 + %2 = load i8, ptr %1 + %3 = sext i8 %2 to i32 + ret i32 %3 } define i32 @i8_geps_index1() { @@ -149,11 +148,14 @@ define i32 @i8_geps_index1() { define i32 @i8_gep_store() { ; CHECK-LABEL: define i32 @i8_gep_store( ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x i32], align 8 + ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [2 x i32], ptr [[ALLOCA]], i32 0, i32 0 + ; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [2 x i32], ptr [[ALLOCA]], i32 0, i32 1 ; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 ; CHECK: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: ret i32 [[LOAD]] %1 = alloca [2 x i32], align 8 + store i8 0, ptr %1 %2 = getelementptr inbounds nuw i8, ptr %1, i32 4 store i8 1, ptr %2 %3 = load i8, ptr %2 diff --git a/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll b/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll new file mode 100644 index 0000000000000..c6789ac7886d5 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll @@ -0,0 +1,41 @@ +; RUN: opt -S -passes='dxil-legalize' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +define float @load() { +; CHECK-LABEL: define float @load +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x float], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [2 x float], ptr [[ALLOCA]], i32 0, i32 0 +; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[GEP]], align 4 +; CHECK-NEXT: ret float [[LOAD]] + %a = alloca [2 x float], align 4 + %b = load float, ptr %a, align 4 + ret float %b +} + +define void @store() { +; CHECK-LABEL: define void @store 
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [3 x i32], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [3 x i32], ptr [[ALLOCA]], i32 0, i32 0 +; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 +; CHECK-NEXT: ret void + %a = alloca [3 x i32], align 4 + store i32 0, ptr %a, align 4 + ret void +} + +@g = local_unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4 +define void @load_whole_global () { +; CHECK-LABEL: define void @load_whole_global +; CHECK-NEXT: load [4 x i32], ptr addrspace(3) @g, align 4 +; CHECK-NEXT: ret void + %l = load [4 x i32], ptr addrspace(3) @g, align 4 + ret void +} + +define void @load_global_index0 () { +; CHECK-LABEL: define void @load_global_index0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @g, i32 0, i32 0 +; CHECK-NEXT: load i32, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: ret void + %l = load i32, ptr addrspace(3) @g, align 4 + ret void +} diff --git a/llvm/test/CodeGen/DirectX/legalize-memcpy.ll b/llvm/test/CodeGen/DirectX/legalize-memcpy.ll index 0cf9d83b3af25..55bdefc12aa77 100644 --- a/llvm/test/CodeGen/DirectX/legalize-memcpy.ll +++ b/llvm/test/CodeGen/DirectX/legalize-memcpy.ll @@ -6,9 +6,9 @@ define void @replace_int_memcpy_test() #0 { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca [1 x i32], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [1 x i32], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1 x i32], ptr [[TMP1]], i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [1 x i32], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[GEP1]], align 4 ; CHECK-NEXT: ret void ; @@ -23,17 +23,17 @@ define void @replace_3int_memcpy_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [3 x i32], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[GEP1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 1 ; CHECK-NEXT: store i32 [[TMP4]], ptr [[GEP3]], align 4 -; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 2 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP1]], i32 0, i32 2 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP4]], align 4 -; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 +; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 2 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[GEP5]], align 4 ; CHECK-NEXT: ret void ; @@ -48,13 +48,13 @@ define void @replace_mismatched_size_int_memcpy_test() #0 
{ ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca [2 x i32], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [3 x i32], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP1]], i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: store i32 [[TMP3]], ptr [[GEP1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP1]], i32 0, i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds [3 x i32], ptr [[TMP2]], i32 0, i32 1 ; CHECK-NEXT: store i32 [[TMP4]], ptr [[GEP3]], align 4 ; CHECK-NEXT: ret void ; @@ -69,13 +69,13 @@ define void @replace_int16_memcpy_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca [2 x i16], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [2 x i16], align 2 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x i16], ptr [[TMP1]], i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[GEP]], align 2 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x i16], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: store i16 [[TMP3]], ptr [[GEP1]], align 2 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x i16], ptr [[TMP1]], i32 0, i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x i16], ptr [[TMP2]], i32 0, i32 1 ; CHECK-NEXT: store i16 [[TMP4]], ptr [[GEP3]], align 2 ; CHECK-NEXT: ret void ; @@ -90,13 +90,13 @@ define void @replace_float_memcpy_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca [2 x float], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [2 x float], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x float], ptr [[TMP1]], i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[GEP]], align 4 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x float], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: store float [[TMP3]], ptr [[GEP1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x float], ptr [[TMP1]], i32 0, i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x float], ptr [[TMP2]], i32 0, i32 1 ; CHECK-NEXT: store float [[TMP4]], ptr [[GEP3]], align 4 ; CHECK-NEXT: ret void ; @@ -111,13 +111,13 @@ define void @replace_double_memcpy_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca [2 x double], align 4 ; CHECK-NEXT: 
[[TMP2:%.*]] = alloca [2 x double], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x double], ptr [[TMP1]], i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[GEP]], align 8 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x double], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: store double [[TMP3]], ptr [[GEP1]], align 8 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x double], ptr [[TMP1]], i32 0, i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[GEP2]], align 8 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x double], ptr [[TMP2]], i32 0, i32 1 ; CHECK-NEXT: store double [[TMP4]], ptr [[GEP3]], align 8 ; CHECK-NEXT: ret void ; @@ -132,13 +132,13 @@ define void @replace_half_memcpy_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = alloca [2 x half], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [2 x half], align 2 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds half, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x half], ptr [[TMP1]], i32 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[GEP]], align 2 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds half, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x half], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: store half [[TMP3]], ptr [[GEP1]], align 2 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds half, ptr [[TMP1]], i32 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x half], ptr [[TMP1]], i32 0, i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds half, ptr [[TMP2]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x half], ptr [[TMP2]], i32 0, i32 1 ; CHECK-NEXT: store half [[TMP4]], ptr [[GEP3]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/DirectX/legalize-memset.ll b/llvm/test/CodeGen/DirectX/legalize-memset.ll index e97817ba824ed..a73e7378cfb9f 100644 --- a/llvm/test/CodeGen/DirectX/legalize-memset.ll +++ b/llvm/test/CodeGen/DirectX/legalize-memset.ll @@ -6,9 +6,9 @@ define void @replace_float_memset_test() #0 { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [2 x float], align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[ACCUM_I_FLAT]]) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[ACCUM_I_FLAT]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [2 x float], ptr [[ACCUM_I_FLAT]], i32 0, i32 0 ; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP]], align 4 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, ptr [[ACCUM_I_FLAT]], i32 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr [2 x float], ptr [[ACCUM_I_FLAT]], i32 0, i32 1 ; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP1]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[ACCUM_I_FLAT]]) ; CHECK-NEXT: ret void @@ -25,9 +25,9 @@ define void @replace_half_memset_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [2 x half], align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]]) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr half, ptr [[ACCUM_I_FLAT]], i32 0 +; 
CHECK-NEXT: [[GEP:%.*]] = getelementptr [2 x half], ptr [[ACCUM_I_FLAT]], i32 0, i32 0 ; CHECK-NEXT: store half 0xH0000, ptr [[GEP]], align 2 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr half, ptr [[ACCUM_I_FLAT]], i32 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr [2 x half], ptr [[ACCUM_I_FLAT]], i32 0, i32 1 ; CHECK-NEXT: store half 0xH0000, ptr [[GEP1]], align 2 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]]) ; CHECK-NEXT: ret void @@ -44,9 +44,9 @@ define void @replace_double_memset_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [2 x double], align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[ACCUM_I_FLAT]]) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[ACCUM_I_FLAT]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [2 x double], ptr [[ACCUM_I_FLAT]], i32 0, i32 0 ; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP]], align 8 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr double, ptr [[ACCUM_I_FLAT]], i32 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr [2 x double], ptr [[ACCUM_I_FLAT]], i32 0, i32 1 ; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP1]], align 8 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[ACCUM_I_FLAT]]) ; CHECK-NEXT: ret void @@ -63,9 +63,9 @@ define void @replace_int16_memset_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[CACHE_I:%.*]] = alloca [2 x i16], align 2 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[CACHE_I]]) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[CACHE_I]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [2 x i16], ptr [[CACHE_I]], i32 0, i32 0 ; CHECK-NEXT: store i16 0, ptr [[GEP]], align 2 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i16, ptr [[CACHE_I]], i32 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr [2 x i16], ptr [[CACHE_I]], i32 0, i32 1 ; CHECK-NEXT: store i16 0, ptr [[GEP1]], align 2 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[CACHE_I]]) ; CHECK-NEXT: ret void @@ -82,7 +82,7 @@ define void @replace_int_memset_test() #0 { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [1 x i32], align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]]) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ACCUM_I_FLAT]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [1 x i32], ptr [[ACCUM_I_FLAT]], i32 0, i32 0 ; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]]) ; CHECK-NEXT: ret void @@ -102,7 +102,7 @@ define void @replace_int_memset_to_var_test() #0 { ; CHECK-NEXT: store i32 1, ptr [[I]], align 4 ; CHECK-NEXT: [[I8_LOAD:%.*]] = load i32, ptr [[I]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]]) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ACCUM_I_FLAT]], i32 0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [1 x i32], ptr [[ACCUM_I_FLAT]], i32 0, i32 0 ; CHECK-NEXT: store i32 [[I8_LOAD]], ptr [[GEP]], align 4 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]]) ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll index 36fed88fc52d6..151603a7161c5 100644 --- a/llvm/test/CodeGen/DirectX/llc-pipeline.ll +++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll @@ -19,10 +19,12 @@ ; CHECK-NEXT: DXIL Intrinsic Expansion ; CHECK-NEXT: DXIL CBuffer Access ; CHECK-NEXT: DXIL Data Scalarization 
-; CHECK-NEXT: DXIL Array Flattener
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Scalarize vector operations
+; CHECK-NEXT: DXIL Array Flattener
+; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: DXIL Forward Handle Accesses
; CHECK-NEXT: DXIL Legalizer
; CHECK-NEXT: DXIL Resource Binding Analysis
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
index d5797f6b51348..0c91c53227763 100644
--- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
+++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
@@ -3,43 +3,36 @@
; Make sure we can load groupshared, static vectors and arrays of vectors
-@"arrayofVecData" = local_unnamed_addr addrspace(3) global [2 x <3 x float>] zeroinitializer, align 16
+@"arrayofVecData" = local_unnamed_addr addrspace(3) global [2 x <4 x i32>] zeroinitializer, align 16
@"vecData" = external addrspace(3) global <4 x i32>, align 4
@staticArrayOfVecData = internal global [3 x <4 x i32>] [<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> <i32 9, i32 10, i32 11, i32 12>], align 4
-@"groushared2dArrayofVectors" = local_unnamed_addr addrspace(3) global [3 x [ 3 x <4 x i32>]] zeroinitializer, align 16
+@"groupshared2dArrayofVectors" = local_unnamed_addr addrspace(3) global [3 x [3 x <4 x i32>]] zeroinitializer, align 16
-; CHECK: @arrayofVecData.scalarized.1dim = local_unnamed_addr addrspace(3) global [6 x float] zeroinitializer, align 16
+; CHECK: @arrayofVecData.scalarized.1dim = local_unnamed_addr addrspace(3) global [8 x i32] zeroinitializer, align 16
; CHECK: @vecData.scalarized = external addrspace(3) global [4 x i32], align 4
; CHECK: @staticArrayOfVecData.scalarized.1dim = internal global [12 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12], align 4
-; CHECK: @groushared2dArrayofVectors.scalarized.1dim = local_unnamed_addr addrspace(3) global [36 x i32] zeroinitializer, align 16
+; CHECK: @groupshared2dArrayofVectors.scalarized.1dim = local_unnamed_addr addrspace(3) global [36 x i32] zeroinitializer, align 16
; CHECK-NOT: @arrayofVecData
; CHECK-NOT: @arrayofVecData.scalarized
; CHECK-NOT: @vecData
; CHECK-NOT: @staticArrayOfVecData
; CHECK-NOT: @staticArrayOfVecData.scalarized
-; CHECK-NOT: @groushared2dArrayofVectors
-; CHECK-NOT: @groushared2dArrayofVectors.scalarized
+; CHECK-NOT: @groupshared2dArrayofVectors
+; CHECK-NOT: @groupshared2dArrayofVectors.scalarized
define <4 x i32> @load_array_vec_test() #0 {
; CHECK-LABEL: define <4 x i32> @load_array_vec_test(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast ptr addrspace(3) @arrayofVecData.scalarized.1dim to ptr addrspace(3)
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1) to ptr addrspace(3)
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 2) to ptr addrspace(3)
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 3) to ptr addrspace(3)
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast ptr addrspace(3)
getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1) to ptr addrspace(3) -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 1) to ptr addrspace(3) -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 2) to ptr addrspace(3) -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 3) to ptr addrspace(3) -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 3), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 4), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 5), align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 6), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 7), align 4 ; CHECK-NEXT: [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]] ; CHECK-NEXT: [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]] ; CHECK-NEXT: [[DOTI27:%.*]] = add i32 [[TMP6]], [[TMP14]] @@ -60,10 +53,11 @@ define <4 x i32> @load_array_vec_test() #0 { define <4 x i32> @load_vec_test() #0 { ; CHECK-LABEL: define <4 x i32> @load_vec_test( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) @vecData.scalarized, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 1), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 2), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 3), align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, 
i32 0, i32 1), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 3), align 4 ; CHECK-NEXT: [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[TMP3]], i32 2 @@ -74,16 +68,60 @@ define <4 x i32> @load_vec_test() #0 { ret <4 x i32> %1 } +define <4 x i32> @load_vec_from_scalar_gep_test() #0 { +; CHECK-LABEL: define <4 x i32> @load_vec_from_scalar_gep_test( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[DOTI04:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds nuw ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 4), align 4 +; CHECK-NEXT: [[DOTI116:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 5), align 4 +; CHECK-NEXT: [[DOTI228:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 6), align 4 +; CHECK-NEXT: [[DOTI3310:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 7), align 4 +; CHECK-NEXT: [[DOTUPTO011:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI04]], i32 0 +; CHECK-NEXT: [[DOTUPTO112:%.*]] = insertelement <4 x i32> [[DOTUPTO011]], i32 [[DOTI116]], i32 1 +; CHECK-NEXT: [[DOTUPTO213:%.*]] = insertelement <4 x i32> [[DOTUPTO112]], i32 [[DOTI228]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[DOTUPTO213]], i32 [[DOTI3310]], i32 3 +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = getelementptr inbounds nuw i32, ptr addrspace(3) @"arrayofVecData", i32 4 + %2 = load <4 x i32>, ptr addrspace(3) %1, align 4 + ret <4 x i32> %2 +} + +define <4 x i32> @load_vec_from_i8_gep_test() #0 { +; CHECK-LABEL: define <4 x i32> @load_vec_from_i8_gep_test( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[DOTI04:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds nuw ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 4), align 4 +; CHECK-NEXT: [[DOTI116:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 5), align 4 +; CHECK-NEXT: [[DOTI228:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 6), align 4 +; CHECK-NEXT: [[DOTI3310:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 7), align 4 +; CHECK-NEXT: [[DOTUPTO011:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI04]], i32 0 +; CHECK-NEXT: [[DOTUPTO112:%.*]] = insertelement <4 x i32> [[DOTUPTO011]], i32 [[DOTI116]], i32 1 +; CHECK-NEXT: [[DOTUPTO213:%.*]] = insertelement <4 x i32> [[DOTUPTO112]], i32 [[DOTI228]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[DOTUPTO213]], i32 [[DOTI3310]], i32 3 +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = getelementptr inbounds nuw i8, ptr addrspace(3) @"arrayofVecData", i32 16 + %2 = load <4 x i32>, ptr addrspace(3) %1, align 4 + ret <4 x i32> %2 +} + define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 { ; CHECK-LABEL: define <4 x i32> @load_static_array_of_vec_test( ; CHECK-SAME: i32 
[[INDEX:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[DOTFLAT:%.*]] = getelementptr inbounds [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTFLAT:%.*]] = getelementptr inbounds [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[DOTI0:%.*]] = load i32, ptr [[DOTFLAT]], align 4 -; CHECK-NEXT: [[DOTFLAT_I1:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 1, [[TMP4]] +; CHECK-NEXT: [[DOTFLAT_I1:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP5]] ; CHECK-NEXT: [[DOTI1:%.*]] = load i32, ptr [[DOTFLAT_I1]], align 4 -; CHECK-NEXT: [[DOTFLAT_I2:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 2, [[TMP6]] +; CHECK-NEXT: [[DOTFLAT_I2:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP7]] ; CHECK-NEXT: [[DOTI2:%.*]] = load i32, ptr [[DOTFLAT_I2]], align 4 -; CHECK-NEXT: [[DOTFLAT_I3:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 3, [[TMP8]] +; CHECK-NEXT: [[DOTFLAT_I3:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP9]] ; CHECK-NEXT: [[DOTI3:%.*]] = load i32, ptr [[DOTFLAT_I3]], align 4 ; CHECK-NEXT: [[DOTUPTO01:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI0]], i32 0 ; CHECK-NEXT: [[DOTUPTO12:%.*]] = insertelement <4 x i32> [[DOTUPTO01]], i32 [[DOTI1]], i32 1 @@ -96,17 +134,86 @@ define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 { ret <4 x i32> %4 } +define <4 x i32> @load_static_array_of_vec_from_scalar_gep_test(i32 %index) #0 { +; CHECK-LABEL: define <4 x i32> @load_static_array_of_vec_from_scalar_gep_test( +; CHECK-SAME: i32 [[INDEX:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 0, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP3]] +; CHECK-NEXT: [[DOTI0:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 1, [[TMP5]] +; CHECK-NEXT: [[DOTI14:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP6]] +; CHECK-NEXT: [[DOTI11:%.*]] = load i32, ptr [[DOTI14]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 2, [[TMP7]] +; CHECK-NEXT: [[DOTI25:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP8]] +; CHECK-NEXT: [[DOTI22:%.*]] = load i32, ptr [[DOTI25]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 3, [[TMP9]] +; CHECK-NEXT: [[DOTI36:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP10]] +; CHECK-NEXT: [[DOTI33:%.*]] = load i32, ptr [[DOTI36]], align 4 +; CHECK-NEXT: [[DOTUPTO07:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI0]], i32 0 +; CHECK-NEXT: [[DOTUPTO18:%.*]] = insertelement <4 x i32> [[DOTUPTO07]], i32 [[DOTI11]], i32 1 +; CHECK-NEXT: [[DOTUPTO29:%.*]] = insertelement <4 x i32> [[DOTUPTO18]], i32 [[DOTI22]], i32 2 
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[DOTUPTO29]], i32 [[DOTI33]], i32 3 +; CHECK-NEXT: ret <4 x i32> [[TMP11]] +; + %2 = mul i32 %index, 4 + %3 = getelementptr inbounds i32, ptr @staticArrayOfVecData, i32 %2 + %4 = load <4 x i32>, <4 x i32>* %3, align 4 + ret <4 x i32> %4 +} + +define <4 x i32> @load_static_array_of_vec_from_i8_gep_test(i32 %index) #0 { +; CHECK-LABEL: define <4 x i32> @load_static_array_of_vec_from_i8_gep_test( +; CHECK-SAME: i32 [[INDEX:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[INDEX]], 12 +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 0, [[TMP12]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP3]] +; CHECK-NEXT: [[DOTI0:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = lshr i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 1, [[TMP14]] +; CHECK-NEXT: [[DOTI14:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP6]] +; CHECK-NEXT: [[DOTI11:%.*]] = load i32, ptr [[DOTI14]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP7]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 2, [[TMP15]] +; CHECK-NEXT: [[DOTI25:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP8]] +; CHECK-NEXT: [[DOTI22:%.*]] = load i32, ptr [[DOTI25]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = lshr i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 3, [[TMP13]] +; CHECK-NEXT: [[DOTI36:%.*]] = getelementptr [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[TMP10]] +; CHECK-NEXT: [[DOTI33:%.*]] = load i32, ptr [[DOTI36]], align 4 +; CHECK-NEXT: [[DOTUPTO07:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI0]], i32 0 +; CHECK-NEXT: [[DOTUPTO18:%.*]] = insertelement <4 x i32> [[DOTUPTO07]], i32 [[DOTI11]], i32 1 +; CHECK-NEXT: [[DOTUPTO29:%.*]] = insertelement <4 x i32> [[DOTUPTO18]], i32 [[DOTI22]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[DOTUPTO29]], i32 [[DOTI33]], i32 3 +; CHECK-NEXT: ret <4 x i32> [[TMP11]] +; + %2 = mul i32 %index, 12 + %3 = getelementptr inbounds i8, ptr @staticArrayOfVecData, i32 %2 + %4 = load <4 x i32>, <4 x i32>* %3, align 4 + ret <4 x i32> %4 +} + define <4 x i32> @multid_load_test() #0 { ; CHECK-LABEL: define <4 x i32> @multid_load_test( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 2), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), align 4 -; CHECK-NEXT: [[DOTI13:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) 
@groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 1), align 4 -; CHECK-NEXT: [[DOTI25:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 2), align 4 -; CHECK-NEXT: [[DOTI37:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 3), align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 3), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 16), align 4 +; CHECK-NEXT: [[DOTI13:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 17), align 4 +; CHECK-NEXT: [[DOTI25:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 18), align 4 +; CHECK-NEXT: [[DOTI37:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 19), align 4 ; CHECK-NEXT: [[DOTI08:%.*]] = add i32 [[TMP1]], [[TMP5]] ; CHECK-NEXT: [[DOTI19:%.*]] = add i32 [[TMP2]], [[DOTI13]] ; CHECK-NEXT: [[DOTI210:%.*]] = add i32 [[TMP3]], [[DOTI25]] @@ -117,8 +224,8 @@ define <4 x i32> @multid_load_test() #0 { ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[DOTUPTO217]], i32 [[DOTI311]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; - %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 0, i32 0), align 4 - %2 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 1, i32 1), align 4 + %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groupshared2dArrayofVectors", i32 0, i32 0, i32 0), align 4 + %2 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groupshared2dArrayofVectors", i32 0, i32 1, i32 1), align 4 %3 = add <4 x i32> %1, %2 ret <4 x i32> %3 } diff --git a/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll b/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll index a07ce2c24f7ac..43bbe9249aac0 100644 --- a/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll +++ b/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll @@ -8,18 +8,12 @@ define internal void @main() #1 { ; CHECK-LABEL: define internal void @main() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 1
-; CHECK-NEXT: [[DOTI0:%.*]] = load float, ptr [[TMP0]], align 16
-; CHECK-NEXT: [[DOTI1:%.*]] = getelementptr float, ptr [[TMP0]], i32 1
-; CHECK-NEXT: [[DOTI11:%.*]] = load float, ptr [[DOTI1]], align 4
-; CHECK-NEXT: [[DOTI2:%.*]] = getelementptr float, ptr [[TMP0]], i32 2
-; CHECK-NEXT: [[DOTI22:%.*]] = load float, ptr [[DOTI2]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 2
-; CHECK-NEXT: [[DOTI03:%.*]] = load float, ptr [[TMP1]], align 16
-; CHECK-NEXT: [[DOTI14:%.*]] = getelementptr float, ptr [[TMP1]], i32 1
-; CHECK-NEXT: [[DOTI15:%.*]] = load float, ptr [[DOTI14]], align 4
-; CHECK-NEXT: [[DOTI26:%.*]] = getelementptr float, ptr [[TMP1]], i32 2
-; CHECK-NEXT: [[DOTI27:%.*]] = load float, ptr [[DOTI26]], align 8
+; CHECK-NEXT: [[DOTI0:%.*]] = load float, ptr getelementptr inbounds ([24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 3), align 16
+; CHECK-NEXT: [[DOTI11:%.*]] = load float, ptr getelementptr (float, ptr getelementptr inbounds ([24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 3), i32 1), align 4
+; CHECK-NEXT: [[DOTI22:%.*]] = load float, ptr getelementptr (float, ptr getelementptr inbounds ([24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 3), i32 2), align 8
+; CHECK-NEXT: [[DOTI03:%.*]] = load float, ptr getelementptr inbounds ([24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 6), align 16
+; CHECK-NEXT: [[DOTI15:%.*]] = load float, ptr getelementptr (float, ptr getelementptr inbounds ([24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 6), i32 1), align 4
+; CHECK-NEXT: [[DOTI27:%.*]] = load float, ptr getelementptr (float, ptr getelementptr inbounds ([24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 6), i32 2), align 8
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/CodeGen/DirectX/scalar-store.ll b/llvm/test/CodeGen/DirectX/scalar-store.ll
index 7e9fe0e330661..4394235ffe4bd 100644
--- a/llvm/test/CodeGen/DirectX/scalar-store.ll
+++ b/llvm/test/CodeGen/DirectX/scalar-store.ll
@@ -14,17 +14,33 @@
; CHECK-LABEL: store_array_vec_test
define void @store_array_vec_test () local_unnamed_addr #0 {
- ; CHECK-COUNT-6: store float {{1|2|3|4|6}}.000000e+00, ptr addrspace(3) {{(.*@arrayofVecData.scalarized.1dim.*|%.*)}}, align {{4|8|16}}
- ; CHECK-NEXT: ret void
- store <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, ptr addrspace(3) @"arrayofVecData", align 16
- store <3 x float> <float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @"arrayofVecData", i32 16), align 16
- ret void
- }
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 0
+; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(3) [[GEP]], align 16
+; CHECK-NEXT: store float 2.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), align 4
+; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 2), align 8
+; CHECK-NEXT: store float 2.000000e+00, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 3), align 16
+; CHECK-NEXT: store float 4.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 4), align 4
+; CHECK-NEXT: store float 6.000000e+00, ptr addrspace(3) getelementptr ([6 x
float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 5), align 8
+; CHECK-NEXT: ret void
+;
+ store <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, ptr addrspace(3) @"arrayofVecData", align 16
+ store <3 x float> <float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @"arrayofVecData", i32 12), align 16
+ ret void
+}
; CHECK-LABEL: store_vec_test
define void @store_vec_test(<4 x i32> %inputVec) #0 {
- ; CHECK-COUNT-4: store i32 %inputVec.{{.*}}, ptr addrspace(3) {{(@vecData.scalarized|getelementptr \(i32, ptr addrspace\(3\) @vecData.scalarized, i32 .*\)|%.*)}}, align 4
- ; CHECK-NEXT: ret void
+; CHECK-NEXT: [[INPUTVEC_I01:%.*]] = extractelement <4 x i32> %inputVec, i32 0
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 0
+; CHECK-NEXT: store i32 [[INPUTVEC_I01]], ptr addrspace(3) [[GEP]], align 4
+; CHECK-NEXT: [[INPUTVEC_I12:%.*]] = extractelement <4 x i32> %inputVec, i32 1
+; CHECK-NEXT: store i32 [[INPUTVEC_I12]], ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 1), align 4
+; CHECK-NEXT: [[INPUTVEC_I23:%.*]] = extractelement <4 x i32> %inputVec, i32 2
+; CHECK-NEXT: store i32 [[INPUTVEC_I23]], ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 2), align 4
+; CHECK-NEXT: [[INPUTVEC_I34:%.*]] = extractelement <4 x i32> %inputVec, i32 3
+; CHECK-NEXT: store i32 [[INPUTVEC_I34]], ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 3), align 4
+; CHECK-NEXT: ret void
+;
store <4 x i32> %inputVec, <4 x i32> addrspace(3)* @"vecData", align 4
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
index 32e2c3ca2c302..a8557e47b0ea6 100644
--- a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
+++ b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll
@@ -33,7 +33,9 @@ define void @alloca_2d_gep_test() {
; FCHECK: [[alloca_val:%.*]] = alloca [4 x i32], align 16
; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0)
; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [2 x [2 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]]
- ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[alloca_val]], i32 0, i32 [[tid]]
+ ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 2
+ ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]]
+ ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]]
; CHECK: ret void
%1 = alloca [2 x <2 x i32>], align 16
%2 = tail call i32 @llvm.dx.thread.id(i32 0)
diff --git a/llvm/test/CodeGen/Hexagon/llvm.exp10.ll b/llvm/test/CodeGen/Hexagon/llvm.exp10.ll
new file mode 100644
index 0000000000000..b5fcc4151225a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/llvm.exp10.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=hexagon < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon-unknown-linux-musl < %s | FileCheck %s
+
+define half @exp10_f16(half %x) #0 {
+; CHECK-LABEL: exp10_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: call __extendhfsf2
+; CHECK-NEXT: allocframe(r29,#0):raw
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: call exp10f
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: call __truncsfhf2
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
+; CHECK-NEXT: }
+ %r = call half
@llvm.exp10.f16(half %x) + ret half %r +} + +define <2 x half> @exp10_v2f16(<2 x half> %x) #0 { +; CHECK-LABEL: exp10_v2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: r16 = r1 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#8) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call exp10f +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __truncsfhf2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: r17 = r0 +; CHECK-NEXT: r0 = r16 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call exp10f +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __truncsfhf2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = zxth(r0) +; CHECK-NEXT: r2 = zxth(r17) +; CHECK-NEXT: r17:16 = memd(r29+#0) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %r = call <2 x half> @llvm.exp10.v2f16(<2 x half> %x) + ret <2 x half> %r +} + +define float @exp10_f32(float %x) #0 { +; CHECK-LABEL: exp10_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call exp10f +; CHECK-NEXT: allocframe(r29,#0):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %r = call float @llvm.exp10.f32(float %x) + ret float %r +} + +define <2 x float> @exp10_v2f32(<2 x float> %x) #0 { +; CHECK-LABEL: exp10_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call exp10f +; CHECK-NEXT: r16 = r1 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#8) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call exp10f +; CHECK-NEXT: r17 = r0 +; CHECK-NEXT: r0 = r16 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = combine(r0,r17) +; CHECK-NEXT: r17:16 = memd(r29+#0) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } // 8-byte Folded Reload + %r = call <2 x float> @llvm.exp10.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define double @exp10_f64(double %x) #0 { +; CHECK-LABEL: exp10_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call exp10 +; CHECK-NEXT: allocframe(r29,#0):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %r = call double @llvm.exp10.f64(double %x) + ret double %r +} + +define <2 x double> @exp10_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: exp10_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r16 = r0 +; CHECK-NEXT: r1:0 = combine(r3,r2) +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#24) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call exp10 +; CHECK-NEXT: r19:18 = combine(r5,r4) +; CHECK-NEXT: memd(r29+#8) = r19:18 +; CHECK-NEXT: memd(r29+#0) = r21:20 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call exp10 +; CHECK-NEXT: r21:20 = combine(r1,r0) +; CHECK-NEXT: r1:0 = combine(r19,r18) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r1:0 +; CHECK-NEXT: memd(r16+#0) = r21:20 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = memd(r29+#16) +; CHECK-NEXT: r19:18 = memd(r29+#8) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r21:20 = memd(r29+#0) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } // 8-byte Folded Reload + %r = call <2 x double> @llvm.exp10.v2f64(<2 x double> %x) + ret <2 x double> %r +} + +define fp128 @exp10_f128(fp128 %x) #0 { +; CHECK-LABEL: exp10_f128: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r16 = r0 +; CHECK-NEXT: 
memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#24) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call exp10l +; CHECK-NEXT: r0 = add(r29,#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = memd(r29+#0) +; CHECK-NEXT: r3:2 = memd(r29+#8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r3:2 +; CHECK-NEXT: memd(r16+#0) = r1:0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = memd(r29+#16) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } // 8-byte Folded Reload + %r = call fp128 @llvm.exp10.f128(fp128 %x) + ret fp128 %r +} + +define <2 x fp128> @exp10_v2f128(<2 x fp128> %x) #0 { +; CHECK-LABEL: exp10_v2f128: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r16 = r0 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#56) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: r0 = add(r29,#16) +; CHECK-NEXT: memd(r29+#40) = r19:18 +; CHECK-NEXT: memd(r29+#32) = r21:20 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call exp10l +; CHECK-NEXT: r19:18 = memd(r29+#64) +; CHECK-NEXT: r21:20 = memd(r29+#72) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call exp10l +; CHECK-NEXT: r0 = add(r29,#0) +; CHECK-NEXT: r3:2 = combine(r19,r18) +; CHECK-NEXT: r5:4 = combine(r21,r20) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = memd(r29+#16) +; CHECK-NEXT: r3:2 = memd(r29+#24) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r5:4 = memd(r29+#0) +; CHECK-NEXT: r7:6 = memd(r29+#8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#24) = r7:6 +; CHECK-NEXT: memd(r16+#16) = r5:4 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r3:2 +; CHECK-NEXT: memd(r16+#0) = r1:0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = memd(r29+#48) +; CHECK-NEXT: r19:18 = memd(r29+#40) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r21:20 = memd(r29+#32) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } // 8-byte Folded Reload + %r = call <2 x fp128> @llvm.exp10.v2f128(<2 x fp128> %x) + ret <2 x fp128> %r +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/Hexagon/llvm.frexp.ll b/llvm/test/CodeGen/Hexagon/llvm.frexp.ll new file mode 100644 index 0000000000000..0b52e051cf251 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/llvm.frexp.ll @@ -0,0 +1,803 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=hexagon < %s | FileCheck %s +; RUN: llc -mtriple=hexagon-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=hexagon-unknown-linux-musl < %s | FileCheck %s + +define { half, i32 } @test_frexp_f16_i32(half %a) nounwind { +; CHECK-LABEL: test_frexp_f16_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __truncsfhf2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = memw(r29+#4) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { half, i32 } @llvm.frexp.f16.i32(half %a) + ret { half, i32 } %result +} + +define half @test_frexp_f16_i32_only_use_fract(half %a) nounwind { +; CHECK-LABEL: test_frexp_f16_i32_only_use_fract: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: 
{ +; CHECK-NEXT: call __truncsfhf2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { half, i32 } @llvm.frexp.f16.i32(half %a) + %result.0 = extractvalue { half, i32 } %result, 0 + ret half %result.0 +} + +define i32 @test_frexp_f16_i32_only_use_exp(half %a) nounwind { +; CHECK-LABEL: test_frexp_f16_i32_only_use_exp: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = memw(r29+#4) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { half, i32 } @llvm.frexp.f16.i32(half %a) + %result.0 = extractvalue { half, i32 } %result, 1 + ret i32 %result.0 +} + +define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f16_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = combine(r2,r0) +; CHECK-NEXT: r0 = r3 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#32) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: memd(r29+#16) = r19:18 +; CHECK-NEXT: memd(r29+#8) = r21:20 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: r18 = r0 +; CHECK-NEXT: r0 = r17 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __truncsfhf2 +; CHECK-NEXT: r20 = memw(r29+#4) +; CHECK-NEXT: r21 = memw(r29+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __truncsfhf2 +; CHECK-NEXT: r17 = r0 +; CHECK-NEXT: r0 = r18 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memh(r16+#2) = r0 +; CHECK-NEXT: memh(r16+#0) = r17 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r21:20 +; CHECK-NEXT: r17:16 = memd(r29+#24) +; CHECK-EMPTY: +; CHECK-NEXT: } :mem_noshuf // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r19:18 = memd(r29+#16) +; CHECK-NEXT: r21:20 = memd(r29+#8) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a) + ret { <2 x half>, <2 x i32> } %result +} + +define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f16_v2i32_only_use_fract: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: r16 = r1 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#16) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __truncsfhf2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: r17 = r0 +; CHECK-NEXT: r0 = r16 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __truncsfhf2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = zxth(r0) +; CHECK-NEXT: r2 = zxth(r17) +; CHECK-NEXT: r17:16 = memd(r29+#8) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { <2 x half>, <2 x 
i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a) + %result.0 = extractvalue { <2 x half>, <2 x i32> } %result, 0 + ret <2 x half> %result.0 +} + +define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f16_v2i32_only_use_exp: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: r16 = r1 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#16) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call __extendhfsf2 +; CHECK-NEXT: r0 = r16 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = memw(r29+#4) +; CHECK-NEXT: r17:16 = memd(r29+#8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = memw(r29+#0) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a) + %result.1 = extractvalue { <2 x half>, <2 x i32> } %result, 1 + ret <2 x i32> %result.1 +} + +define { <3 x float>, <3 x i32> } @test_frexp_v3f32_v3i32(<3 x float> %a) nounwind { +; CHECK-LABEL: test_frexp_v3f32_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = combine(r4,r0) +; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#32) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: r18 = r3 +; CHECK-NEXT: memd(r29+#16) = r19:18 +; CHECK-NEXT: memd(r29+#8) = r21:20 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r0 = r18 +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: r20 = r0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r16,#24) +; CHECK-NEXT: r21 = r0 +; CHECK-NEXT: r0 = r17 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = memw(r29+#4) +; CHECK-NEXT: r3 = memw(r29+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#0) = r21:20 +; CHECK-NEXT: memd(r16+#16) = r3:2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memw(r16+#8) = r0 +; CHECK-NEXT: r17:16 = memd(r29+#24) +; CHECK-EMPTY: +; CHECK-NEXT: } :mem_noshuf // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r19:18 = memd(r29+#16) +; CHECK-NEXT: r21:20 = memd(r29+#8) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> %a) + ret { <3 x float>, <3 x i32> } %result +} + +define { float, i32 } @test_frexp_f32_i32(float %a) nounwind { +; CHECK-LABEL: test_frexp_f32_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = memw(r29+#4) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { float, i32 } @llvm.frexp.f32.i32(float %a) + ret { float, i32 } %result +} + +define { float, i32 } @test_frexp_f32_i32_tailcall(float %a) nounwind { +; CHECK-LABEL: test_frexp_f32_i32_tailcall: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = memw(r29+#4) +; 
CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = tail call { float, i32 } @llvm.frexp.f32.i32(float %a) + ret { float, i32 } %result +} + +define float @test_frexp_f32_i32_only_use_fract(float %a) nounwind { +; CHECK-LABEL: test_frexp_f32_i32_only_use_fract: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { float, i32 } @llvm.frexp.f32.i32(float %a) + %result.0 = extractvalue { float, i32 } %result, 0 + ret float %result.0 +} + +define i32 @test_frexp_f32_i32_only_use_exp(float %a) nounwind { +; CHECK-LABEL: test_frexp_f32_i32_only_use_exp: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = memw(r29+#4) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { float, i32 } @llvm.frexp.f32.i32(float %a) + %result.0 = extractvalue { float, i32 } %result, 1 + ret i32 %result.0 +} + +define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f32_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = combine(r3,r0) +; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#24) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: memd(r29+#8) = r19:18 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r0 = r17 +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: r18 = r0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = combine(r0,r18) +; CHECK-NEXT: r2 = memw(r29+#4) +; CHECK-NEXT: r3 = memw(r29+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#0) = r1:0 +; CHECK-NEXT: memd(r16+#8) = r3:2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = memd(r29+#16) +; CHECK-NEXT: r19:18 = memd(r29+#8) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a) + ret { <2 x float>, <2 x i32> } %result +} + +define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f32_v2i32_only_use_fract: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r16 = r1 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#16) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r0 = r16 +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: r17 = r0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = combine(r0,r17) +; CHECK-NEXT: r17:16 = memd(r29+#8) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } // 8-byte Folded Reload + %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a) + %result.0 = extractvalue { <2 x float>, <2 x i32> } %result, 0 + ret <2 x float> %result.0 +} + +define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f32_v2i32_only_use_exp: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r16 = r1 +; CHECK-NEXT: memd(r29+#-16) = r17:16 
+; CHECK-NEXT: allocframe(#16) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: r0 = r16 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = memw(r29+#4) +; CHECK-NEXT: r17:16 = memd(r29+#8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = memw(r29+#0) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a) + %result.1 = extractvalue { <2 x float>, <2 x i32> } %result, 1 + ret <2 x i32> %result.1 +} + +define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwind { +; CHECK-LABEL: test_frexp_v4f32_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = combine(r5,r0) +; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#48) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: r1 = add(r29,#12) +; CHECK-NEXT: r19:18 = combine(r3,r4) +; CHECK-NEXT: memd(r29+#32) = r19:18 +; CHECK-NEXT: memd(r29+#24) = r21:20 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: memd(r29+#16) = r23:22 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r0 = r19 +; CHECK-NEXT: r1 = add(r29,#8) +; CHECK-NEXT: r20 = r0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = r18 +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: r19 = r0 +; CHECK-NEXT: r22 = memw(r29+#12) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r23 = memw(r29+#8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r0 = r17 +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: r18 = r0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = combine(r0,r18) +; CHECK-NEXT: r5:4 = combine(r19,r20) +; CHECK-NEXT: r2 = memw(r29+#4) +; CHECK-NEXT: r3 = memw(r29+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r1:0 +; CHECK-NEXT: memd(r16+#0) = r5:4 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#24) = r3:2 +; CHECK-NEXT: memd(r16+#16) = r23:22 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = memd(r29+#40) +; CHECK-NEXT: r19:18 = memd(r29+#32) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r21:20 = memd(r29+#24) +; CHECK-NEXT: r23:22 = memd(r29+#16) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a) + ret { <4 x float>, <4 x i32> } %result +} + +define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwind { +; CHECK-LABEL: test_frexp_v4f32_v4i32_only_use_fract: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = combine(r5,r0) +; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#40) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: r1 = add(r29,#12) +; CHECK-NEXT: r19:18 = combine(r3,r4) +; CHECK-NEXT: memd(r29+#24) = r19:18 +; CHECK-NEXT: memd(r29+#16) = r21:20 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r0 = r19 +; CHECK-NEXT: r1 = add(r29,#8) +; CHECK-NEXT: r20 = r0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r0 = r18 +; 
CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: r19 = r0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r0 = r17 +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: r18 = r0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1:0 = combine(r0,r18) +; CHECK-NEXT: r3:2 = combine(r19,r20) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r1:0 +; CHECK-NEXT: memd(r16+#0) = r3:2 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = memd(r29+#32) +; CHECK-NEXT: r19:18 = memd(r29+#24) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r21:20 = memd(r29+#16) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } // 8-byte Folded Reload + %result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a) + %result.0 = extractvalue { <4 x float>, <4 x i32> } %result, 0 + ret <4 x float> %result.0 +} + +define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { +; CHECK-LABEL: test_frexp_v4f32_v4i32_only_use_exp: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = combine(r5,r0) +; CHECK-NEXT: r0 = r2 +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#32) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r19:18 = combine(r3,r4) +; CHECK-NEXT: r1 = add(r29,#12) +; CHECK-NEXT: memd(r29+#16) = r19:18 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#8) +; CHECK-NEXT: r0 = r19 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#4) +; CHECK-NEXT: r0 = r18 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexpf +; CHECK-NEXT: r1 = add(r29,#0) +; CHECK-NEXT: r0 = r17 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = memw(r29+#12) +; CHECK-NEXT: r2 = memw(r29+#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = memw(r29+#8) +; CHECK-NEXT: r3 = memw(r29+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r3:2 +; CHECK-NEXT: memd(r16+#0) = r1:0 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = memd(r29+#24) +; CHECK-NEXT: r19:18 = memd(r29+#16) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a) + %result.1 = extractvalue { <4 x float>, <4 x i32> } %result, 1 + ret <4 x i32> %result.1 +} + +define { double, i32 } @test_frexp_f64_i32(double %a) nounwind { +; CHECK-LABEL: test_frexp_f64_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r16 = r0 +; CHECK-NEXT: r1:0 = combine(r3,r2) +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#8) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: r2 = add(r16,#8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#0) = r1:0 +; CHECK-NEXT: r17:16 = memd(r29+#0) +; CHECK-EMPTY: +; CHECK-NEXT: } :mem_noshuf // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { double, i32 } @llvm.frexp.f64.i32(double %a) + ret { double, i32 } %result +} + +define double @test_frexp_f64_i32_only_use_fract(double %a) nounwind { +; CHECK-LABEL: test_frexp_f64_i32_only_use_fract: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: r2 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = 
dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { double, i32 } @llvm.frexp.f64.i32(double %a) + %result.0 = extractvalue { double, i32 } %result, 0 + ret double %result.0 +} + +define i32 @test_frexp_f64_i32_only_use_exp(double %a) nounwind { +; CHECK-LABEL: test_frexp_f64_i32_only_use_exp: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: r2 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = memw(r29+#4) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { double, i32 } @llvm.frexp.f64.i32(double %a) + %result.0 = extractvalue { double, i32 } %result, 1 + ret i32 %result.0 +} + +define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f64_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r16 = r0 +; CHECK-NEXT: r1:0 = combine(r3,r2) +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#32) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: r2 = add(r29,#4) +; CHECK-NEXT: r19:18 = combine(r5,r4) +; CHECK-NEXT: memd(r29+#16) = r19:18 +; CHECK-NEXT: memd(r29+#8) = r21:20 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: r21:20 = combine(r1,r0) +; CHECK-NEXT: r1:0 = combine(r19,r18) +; CHECK-NEXT: r2 = add(r29,#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r2 = memw(r29+#4) +; CHECK-NEXT: r3 = memw(r29+#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r1:0 +; CHECK-NEXT: memd(r16+#0) = r21:20 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#16) = r3:2 +; CHECK-NEXT: r17:16 = memd(r29+#24) +; CHECK-EMPTY: +; CHECK-NEXT: } :mem_noshuf // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r19:18 = memd(r29+#16) +; CHECK-NEXT: r21:20 = memd(r29+#8) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r31:30 = dealloc_return(r30):raw +; CHECK-NEXT: } + %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) + ret { <2 x double>, <2 x i32> } %result +} + +define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r16 = r0 +; CHECK-NEXT: r1:0 = combine(r3,r2) +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#32) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: r2 = add(r29,#4) +; CHECK-NEXT: r19:18 = combine(r5,r4) +; CHECK-NEXT: memd(r29+#16) = r19:18 +; CHECK-NEXT: memd(r29+#8) = r21:20 +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: r21:20 = combine(r1,r0) +; CHECK-NEXT: r1:0 = combine(r19,r18) +; CHECK-NEXT: r2 = add(r29,#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: memd(r16+#8) = r1:0 +; CHECK-NEXT: memd(r16+#0) = r21:20 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = memd(r29+#24) +; CHECK-NEXT: r19:18 = memd(r29+#16) +; CHECK-NEXT: } // 8-byte Folded Reload +; CHECK-NEXT: { +; CHECK-NEXT: r21:20 = memd(r29+#8) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } // 8-byte Folded Reload + %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) + %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 + ret <2 x double> %result.0 +} + +define 
<2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) nounwind { +; CHECK-LABEL: test_frexp_v2f64_v2i32_only_use_exp: +; CHECK: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: r17:16 = combine(r3,r2) +; CHECK-NEXT: memd(r29+#-16) = r17:16 +; CHECK-NEXT: allocframe(#16) +; CHECK-NEXT: } // 8-byte Folded Spill +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: r2 = add(r29,#4) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: call frexp +; CHECK-NEXT: r1:0 = combine(r17,r16) +; CHECK-NEXT: r2 = add(r29,#0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = memw(r29+#4) +; CHECK-NEXT: r17:16 = memd(r29+#8) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r1 = memw(r29+#0) +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } + %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) + %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1 + ret <2 x i32> %result.1 +} + +declare { float, i32 } @llvm.frexp.f32.i32(float) #0 +declare { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float>) #0 +declare { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float>) #0 + +declare { half, i32 } @llvm.frexp.f16.i32(half) #0 +declare { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half>) #0 + +declare { double, i32 } @llvm.frexp.f64.i32(double) #0 +declare { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double>) #0 + +declare { half, i16 } @llvm.frexp.f16.i16(half) #0 +declare { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half>) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll b/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll new file mode 100644 index 0000000000000..ff50f1abe5897 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=hexagon -verify-machineinstrs < %s | FileCheck %s + +; CHECK-NOT: r{{[0-9]+}} = asr(r{{[0-9]+}},#{{[0-9]+}}) +; CHECK-NOT: r{{[0-9]+}}:{{[0-9]+}} = mpyu(r{{[0-9]+}},r{{[0-9]+}}) +; CHECK-NOT: r{{[0-9]+}} += mpyi(r{{[0-9]+}},r{{[0-9]+}}) +; CHECK: r{{[0-9]+}}:{{[0-9]+}} = mpy(r{{[0-9]+}},r{{[0-9]+}}) + +; ModuleID = '39544.c' +source_filename = "39544.c" +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define dso_local void @mul_n(i64* nocapture %p, i32* nocapture readonly %a, i32 %k, i32 %n) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv1 = sext i32 %k to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %arrayidx.phi = phi i32* [ %a, %for.body.lr.ph ], [ %arrayidx.inc, %for.body ] + %arrayidx2.phi = phi i64* [ %p, %for.body.lr.ph ], [ %arrayidx2.inc, %for.body ] + %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %0 = load i32, i32* %arrayidx.phi, align 4 + %conv = sext i32 %0 to i64 + %mul = mul nsw i64 %conv, %conv1 + store i64 %mul, i64* %arrayidx2.phi, align 8 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %n + %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1 + %arrayidx2.inc = getelementptr i64, i64* %arrayidx2.phi, i32 1 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/CodeGen/Hexagon/thread-pointer.ll 
b/llvm/test/CodeGen/Hexagon/thread-pointer.ll new file mode 100644 index 0000000000000..cebb9ff3e202d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/thread-pointer.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=hexagon < %s | FileCheck %s +; +; This test verifies the thread pointer intrinsic implementation for Hexagon. +; The thread pointer (UGP register) is used to access thread-local storage. + +declare ptr @llvm.thread.pointer() nounwind readnone + +define ptr @thread_pointer() nounwind { +; CHECK-LABEL: thread_pointer: +; CHECK: // %bb.0: +; CHECK: r0 = ugp +; CHECK-NEXT: jumpr r31 + %1 = tail call ptr @llvm.thread.pointer() + ret ptr %1 +} diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll index d07e2914c753a..f7653af1fa9ba 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll @@ -122,23 +122,23 @@ define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind { define i64 @caller_large_scalars() nounwind { ; CHECK-LABEL: caller_large_scalars: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -80 -; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill -; CHECK-NEXT: st.d $zero, $sp, 24 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $zero, $sp, 40 ; CHECK-NEXT: vrepli.b $vr0, 0 -; CHECK-NEXT: vst $vr0, $sp, 8 +; CHECK-NEXT: vst $vr0, $sp, 24 ; CHECK-NEXT: ori $a0, $zero, 2 -; CHECK-NEXT: st.d $a0, $sp, 0 -; CHECK-NEXT: st.d $zero, $sp, 56 -; CHECK-NEXT: vst $vr0, $sp, 40 +; CHECK-NEXT: st.d $a0, $sp, 16 +; CHECK-NEXT: st.d $zero, $sp, 72 +; CHECK-NEXT: vst $vr0, $sp, 56 ; CHECK-NEXT: ori $a2, $zero, 1 -; CHECK-NEXT: addi.d $a0, $sp, 32 -; CHECK-NEXT: addi.d $a1, $sp, 0 -; CHECK-NEXT: st.d $a2, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 48 +; CHECK-NEXT: addi.d $a1, $sp, 16 +; CHECK-NEXT: st.d $a2, $sp, 48 ; CHECK-NEXT: pcaddu18i $ra, %call36(callee_large_scalars) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 80 +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %1 = call i64 @callee_large_scalars(i256 1, i256 2) ret i64 %1 @@ -177,20 +177,20 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d, define i64 @caller_large_scalars_exhausted_regs() nounwind { ; CHECK-LABEL: caller_large_scalars_exhausted_regs: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $a0, $sp, 16 +; CHECK-NEXT: addi.d $sp, $sp, -112 +; CHECK-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: st.d $a0, $sp, 8 ; CHECK-NEXT: ori $a0, $zero, 9 ; CHECK-NEXT: st.d $a0, $sp, 0 -; CHECK-NEXT: st.d $zero, $sp, 40 +; CHECK-NEXT: st.d $zero, $sp, 56 ; CHECK-NEXT: vrepli.b $vr0, 0 -; CHECK-NEXT: vst $vr0, $sp, 24 +; CHECK-NEXT: vst $vr0, $sp, 40 ; CHECK-NEXT: ori $a0, $zero, 10 -; CHECK-NEXT: st.d $a0, $sp, 16 -; CHECK-NEXT: st.d $zero, $sp, 72 +; CHECK-NEXT: st.d $a0, $sp, 32 +; CHECK-NEXT: st.d $zero, $sp, 88 ; CHECK-NEXT: ori $a0, $zero, 8 -; CHECK-NEXT: st.d $a0, $sp, 48 +; CHECK-NEXT: st.d $a0, $sp, 64 ; CHECK-NEXT: ori $a0, $zero, 1 ; CHECK-NEXT: ori $a1, $zero, 2 ; CHECK-NEXT: ori $a2, $zero, 3 @@ -198,12 +198,12 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind { ; 
CHECK-NEXT: ori $a4, $zero, 5 ; CHECK-NEXT: ori $a5, $zero, 6 ; CHECK-NEXT: ori $a6, $zero, 7 -; CHECK-NEXT: addi.d $a7, $sp, 48 -; CHECK-NEXT: vst $vr0, $sp, 56 +; CHECK-NEXT: addi.d $a7, $sp, 64 +; CHECK-NEXT: vst $vr0, $sp, 72 ; CHECK-NEXT: pcaddu18i $ra, %call36(callee_large_scalars_exhausted_regs) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 112 ; CHECK-NEXT: ret %1 = call i64 @callee_large_scalars_exhausted_regs( i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i256 8, i64 9, diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll index c88b67f13d1e7..da8c3e93f6842 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll @@ -1252,8 +1252,8 @@ define i32 @caller_half_on_stack() nounwind { ; ; LA64F-LP64S-LABEL: caller_half_on_stack: ; LA64F-LP64S: # %bb.0: -; LA64F-LP64S-NEXT: addi.d $sp, $sp, -80 -; LA64F-LP64S-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64F-LP64S-NEXT: addi.d $sp, $sp, -96 +; LA64F-LP64S-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill ; LA64F-LP64S-NEXT: lu12i.w $a0, -12 ; LA64F-LP64S-NEXT: ori $a1, $a0, 3200 ; LA64F-LP64S-NEXT: lu32i.d $a1, 0 @@ -1292,8 +1292,8 @@ define i32 @caller_half_on_stack() nounwind { ; LA64F-LP64S-NEXT: st.w $t0, $sp, 0 ; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(callee_half_on_stack) ; LA64F-LP64S-NEXT: jirl $ra, $ra, 0 -; LA64F-LP64S-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; LA64F-LP64S-NEXT: addi.d $sp, $sp, 80 +; LA64F-LP64S-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; LA64F-LP64S-NEXT: addi.d $sp, $sp, 96 ; LA64F-LP64S-NEXT: ret ; ; LA64F-LP64D-LABEL: caller_half_on_stack: @@ -1336,8 +1336,8 @@ define i32 @caller_half_on_stack() nounwind { ; ; LA64D-LP64S-LABEL: caller_half_on_stack: ; LA64D-LP64S: # %bb.0: -; LA64D-LP64S-NEXT: addi.d $sp, $sp, -80 -; LA64D-LP64S-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64D-LP64S-NEXT: addi.d $sp, $sp, -96 +; LA64D-LP64S-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill ; LA64D-LP64S-NEXT: lu12i.w $a0, -12 ; LA64D-LP64S-NEXT: ori $a1, $a0, 3200 ; LA64D-LP64S-NEXT: lu32i.d $a1, 0 @@ -1376,8 +1376,8 @@ define i32 @caller_half_on_stack() nounwind { ; LA64D-LP64S-NEXT: st.w $t0, $sp, 0 ; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(callee_half_on_stack) ; LA64D-LP64S-NEXT: jirl $ra, $ra, 0 -; LA64D-LP64S-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; LA64D-LP64S-NEXT: addi.d $sp, $sp, 80 +; LA64D-LP64S-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; LA64D-LP64S-NEXT: addi.d $sp, $sp, 96 ; LA64D-LP64S-NEXT: ret ; ; LA64D-LP64D-LABEL: caller_half_on_stack: diff --git a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll index 52d8dd05aaa4c..1a9de3b0ef3d1 100644 --- a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll +++ b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll @@ -14,41 +14,41 @@ define dso_local noundef signext i32 @main() nounwind { ; CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -272 -; CHECK-NEXT: st.d $ra, $sp, 264 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -288 +; CHECK-NEXT: st.d $ra, $sp, 280 # 8-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) ; CHECK-NEXT: xvld $xr0, $a0, %pc_lo12(.LCPI0_0) -; CHECK-NEXT: xvst $xr0, $sp, 96 # 32-byte Folded Spill +; CHECK-NEXT: xvst 
$xr0, $sp, 112 # 32-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) ; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_1) -; CHECK-NEXT: xvst $xr1, $sp, 64 # 32-byte Folded Spill +; CHECK-NEXT: xvst $xr1, $sp, 80 # 32-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_2) ; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_2) -; CHECK-NEXT: xvst $xr2, $sp, 32 # 32-byte Folded Spill +; CHECK-NEXT: xvst $xr2, $sp, 48 # 32-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_3) ; CHECK-NEXT: xvld $xr3, $a0, %pc_lo12(.LCPI0_3) -; CHECK-NEXT: xvst $xr3, $sp, 0 # 32-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 136 -; CHECK-NEXT: xvst $xr1, $sp, 168 -; CHECK-NEXT: xvst $xr2, $sp, 200 -; CHECK-NEXT: xvst $xr3, $sp, 232 -; CHECK-NEXT: addi.d $a0, $sp, 136 +; CHECK-NEXT: xvst $xr3, $sp, 16 # 32-byte Folded Spill +; CHECK-NEXT: xvst $xr0, $sp, 152 +; CHECK-NEXT: xvst $xr1, $sp, 184 +; CHECK-NEXT: xvst $xr2, $sp, 216 +; CHECK-NEXT: xvst $xr3, $sp, 248 +; CHECK-NEXT: addi.d $a0, $sp, 152 ; CHECK-NEXT: pcaddu18i $ra, %call36(foo) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: xvld $xr0, $sp, 96 # 32-byte Folded Reload -; CHECK-NEXT: xvst $xr0, $sp, 136 -; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; CHECK-NEXT: xvst $xr0, $sp, 168 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload -; CHECK-NEXT: xvst $xr0, $sp, 200 -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload -; CHECK-NEXT: xvst $xr0, $sp, 232 -; CHECK-NEXT: addi.d $a0, $sp, 136 +; CHECK-NEXT: xvld $xr0, $sp, 112 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 152 +; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 184 +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 216 +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 248 +; CHECK-NEXT: addi.d $a0, $sp, 152 ; CHECK-NEXT: pcaddu18i $ra, %call36(bar) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: move $a0, $zero -; CHECK-NEXT: ld.d $ra, $sp, 264 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 272 +; CHECK-NEXT: ld.d $ra, $sp, 280 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 288 ; CHECK-NEXT: ret entry: %s = alloca %struct.S, align 2 diff --git a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll index ccc5c703e71ed..15ac95dfc6c55 100644 --- a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll +++ b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll @@ -28,12 +28,12 @@ define void @func() { ; CHECK-NEXT: ld.w $a3, $a1, 0 ; CHECK-NEXT: ld.w $a2, $a1, 0 ; CHECK-NEXT: ld.w $a0, $a1, 0 -; CHECK-NEXT: st.d $fp, $sp, 0 +; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill ; CHECK-NEXT: lu12i.w $fp, 1 ; CHECK-NEXT: ori $fp, $fp, 12 ; CHECK-NEXT: add.d $fp, $sp, $fp ; CHECK-NEXT: st.w $t8, $fp, 0 -; CHECK-NEXT: ld.d $fp, $sp, 0 +; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload ; CHECK-NEXT: st.w $t8, $a1, 0 ; CHECK-NEXT: st.w $t7, $a1, 0 ; CHECK-NEXT: st.w $t6, $a1, 0 diff --git a/llvm/test/CodeGen/LoongArch/frame.ll b/llvm/test/CodeGen/LoongArch/frame.ll index 048703029d8c6..b29d8634854f3 100644 --- a/llvm/test/CodeGen/LoongArch/frame.ll +++ b/llvm/test/CodeGen/LoongArch/frame.ll @@ -1,5 +1,6 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=loongarch64 -mattr=+d < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc 
--mtriple=loongarch64 -mattr=+d,-lsx < %s | FileCheck %s --check-prefixes=CHECK,NOLSX +; RUN: llc --mtriple=loongarch64 -mattr=+d,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LSX %struct.key_t = type { i32, [16 x i8] } @@ -7,20 +8,35 @@ declare void @llvm.memset.p0.i64(ptr, i8, i64, i1) declare void @test1(ptr) define i32 @test() nounwind { -; CHECK-LABEL: test: -; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -32 -; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; CHECK-NEXT: st.w $zero, $sp, 16 -; CHECK-NEXT: vrepli.b $vr0, 0 -; CHECK-NEXT: vst $vr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 4 -; CHECK-NEXT: pcaddu18i $ra, %call36(test1) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: move $a0, $zero -; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 32 -; CHECK-NEXT: ret +; NOLSX-LABEL: test: +; NOLSX: # %bb.0: +; NOLSX-NEXT: addi.d $sp, $sp, -32 +; NOLSX-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; NOLSX-NEXT: st.w $zero, $sp, 16 +; NOLSX-NEXT: st.d $zero, $sp, 8 +; NOLSX-NEXT: st.d $zero, $sp, 0 +; NOLSX-NEXT: addi.d $a0, $sp, 4 +; NOLSX-NEXT: pcaddu18i $ra, %call36(test1) +; NOLSX-NEXT: jirl $ra, $ra, 0 +; NOLSX-NEXT: move $a0, $zero +; NOLSX-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; NOLSX-NEXT: addi.d $sp, $sp, 32 +; NOLSX-NEXT: ret +; +; LSX-LABEL: test: +; LSX: # %bb.0: +; LSX-NEXT: addi.d $sp, $sp, -32 +; LSX-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LSX-NEXT: st.w $zero, $sp, 16 +; LSX-NEXT: vrepli.b $vr0, 0 +; LSX-NEXT: vst $vr0, $sp, 0 +; LSX-NEXT: addi.d $a0, $sp, 4 +; LSX-NEXT: pcaddu18i $ra, %call36(test1) +; LSX-NEXT: jirl $ra, $ra, 0 +; LSX-NEXT: move $a0, $zero +; LSX-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LSX-NEXT: addi.d $sp, $sp, 32 +; LSX-NEXT: ret %key = alloca %struct.key_t, align 4 call void @llvm.memset.p0.i64(ptr %key, i8 0, i64 20, i1 false) %1 = getelementptr inbounds %struct.key_t, ptr %key, i64 0, i32 1, i64 0 @@ -98,3 +114,62 @@ define void @test_large_frame_size_1234576() "frame-pointer"="all" { %1 = alloca i8, i32 1234567 ret void } + +;; Note: will create an emergency spill slot, if (!isInt<7>(StackSize)). +;; Should involve only one SP-adjusting addi per adjustment. 
+;; LSX 112 + 16(emergency slot) = 128 +define void @test_frame_size_112() { +; NOLSX-LABEL: test_frame_size_112: +; NOLSX: # %bb.0: +; NOLSX-NEXT: addi.d $sp, $sp, -112 +; NOLSX-NEXT: .cfi_def_cfa_offset 112 +; NOLSX-NEXT: addi.d $sp, $sp, 112 +; NOLSX-NEXT: ret +; +; LSX-LABEL: test_frame_size_112: +; LSX: # %bb.0: +; LSX-NEXT: addi.d $sp, $sp, -128 +; LSX-NEXT: .cfi_def_cfa_offset 128 +; LSX-NEXT: addi.d $sp, $sp, 128 +; LSX-NEXT: ret + %1 = alloca i8, i32 112 + ret void +} + +;; LSX 128 + 16(emergency slot) = 144 +define void @test_frame_size_128() { +; NOLSX-LABEL: test_frame_size_128: +; NOLSX: # %bb.0: +; NOLSX-NEXT: addi.d $sp, $sp, -128 +; NOLSX-NEXT: .cfi_def_cfa_offset 128 +; NOLSX-NEXT: addi.d $sp, $sp, 128 +; NOLSX-NEXT: ret +; +; LSX-LABEL: test_frame_size_128: +; LSX: # %bb.0: +; LSX-NEXT: addi.d $sp, $sp, -144 +; LSX-NEXT: .cfi_def_cfa_offset 144 +; LSX-NEXT: addi.d $sp, $sp, 144 +; LSX-NEXT: ret + %1 = alloca i8, i32 128 + ret void +} + +;; LSX 144 + 16(emergency slot) = 160 +define void @test_frame_size_144() { +; NOLSX-LABEL: test_frame_size_144: +; NOLSX: # %bb.0: +; NOLSX-NEXT: addi.d $sp, $sp, -144 +; NOLSX-NEXT: .cfi_def_cfa_offset 144 +; NOLSX-NEXT: addi.d $sp, $sp, 144 +; NOLSX-NEXT: ret +; +; LSX-LABEL: test_frame_size_144: +; LSX: # %bb.0: +; LSX-NEXT: addi.d $sp, $sp, -160 +; LSX-NEXT: .cfi_def_cfa_offset 160 +; LSX-NEXT: addi.d $sp, $sp, 160 +; LSX-NEXT: ret + %1 = alloca i8, i32 144 + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll index 402ddb9ad941b..5a55b253c77bb 100644 --- a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll +++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll @@ -6,11 +6,11 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 dereferenceable(48) %b, i64 %i) { ; CHECK-LABEL: box: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: addi.d $sp, $sp, -112 +; CHECK-NEXT: .cfi_def_cfa_offset 112 ; CHECK-NEXT: slli.d $a2, $a1, 5 ; CHECK-NEXT: alsl.d $a1, $a1, $a2, 4 -; CHECK-NEXT: addi.d $a2, $sp, 0 +; CHECK-NEXT: addi.d $a2, $sp, 16 ; CHECK-NEXT: add.d $a3, $a2, $a1 ; CHECK-NEXT: vldx $vr0, $a1, $a2 ; CHECK-NEXT: vld $vr1, $a3, 32 @@ -18,7 +18,7 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 der ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: vst $vr1, $a0, 32 ; CHECK-NEXT: vst $vr2, $a0, 16 -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: addi.d $sp, $sp, 112 ; CHECK-NEXT: ret %1 = alloca [2 x %Box], align 16 %2 = getelementptr inbounds [2 x %Box], ptr %1, i64 0, i64 %i diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll index b06f6523e977c..f25e988b52dc9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll @@ -250,84 +250,68 @@ define void @buildvector_v32i8(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 14 ; CHECK-NEXT: ld.b $a1, $sp, 72 ; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 15 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: ld.b $a2, $sp, 80 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0 -; CHECK-NEXT: ld.b $a1, $sp, 80 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1 ; CHECK-NEXT: ld.b $a1, $sp, 88 ; CHECK-NEXT:
xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2 -; CHECK-NEXT: ld.b $a1, $sp, 96 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 1 +; CHECK-NEXT: ld.b $a2, $sp, 96 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 3 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2 ; CHECK-NEXT: ld.b $a1, $sp, 104 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4 -; CHECK-NEXT: ld.b $a1, $sp, 112 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 3 +; CHECK-NEXT: ld.b $a2, $sp, 112 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 5 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4 ; CHECK-NEXT: ld.b $a1, $sp, 120 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6 -; CHECK-NEXT: ld.b $a1, $sp, 128 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 5 +; CHECK-NEXT: ld.b $a2, $sp, 128 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6 ; CHECK-NEXT: ld.b $a1, $sp, 136 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8 -; CHECK-NEXT: ld.b $a1, $sp, 144 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 7 +; CHECK-NEXT: ld.b $a2, $sp, 144 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 9 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8 ; CHECK-NEXT: ld.b $a1, $sp, 152 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10 -; CHECK-NEXT: ld.b $a1, $sp, 160 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 9 +; CHECK-NEXT: ld.b $a2, $sp, 160 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 11 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10 ; CHECK-NEXT: ld.b $a1, $sp, 168 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12 -; CHECK-NEXT: ld.b $a1, $sp, 176 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 11 +; CHECK-NEXT: ld.b $a2, $sp, 176 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 13 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12 ; CHECK-NEXT: ld.b $a1, $sp, 184 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: xvpermi.d $xr1, 
$xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 13 +; CHECK-NEXT: ld.b $a2, $sp, 192 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14 -; CHECK-NEXT: ld.b $a1, $sp, 192 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 15 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 15 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret @@ -371,8 +355,15 @@ entry: define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { ; CHECK-LABEL: buildvector_v16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.h $t0, $sp, 8 -; CHECK-NEXT: ld.h $t1, $sp, 0 +; CHECK-NEXT: ld.h $t0, $sp, 64 +; CHECK-NEXT: ld.h $t1, $sp, 56 +; CHECK-NEXT: ld.h $t2, $sp, 48 +; CHECK-NEXT: ld.h $t3, $sp, 40 +; CHECK-NEXT: ld.h $t4, $sp, 32 +; CHECK-NEXT: ld.h $t5, $sp, 24 +; CHECK-NEXT: ld.h $t6, $sp, 16 +; CHECK-NEXT: ld.h $t7, $sp, 8 +; CHECK-NEXT: ld.h $t8, $sp, 0 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 2 @@ -380,45 +371,30 @@ define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i1 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a5, 4 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a6, 5 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a7, 6 -; CHECK-NEXT: vinsgr2vr.h $vr0, $t1, 7 -; CHECK-NEXT: ld.h $a1, $sp, 16 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 0 +; CHECK-NEXT: vinsgr2vr.h $vr0, $t8, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t7, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 24 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t6, 1 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 32 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t5, 2 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 40 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t4, 3 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 48 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 4 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t3, 4 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 56 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t2, 5 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 64 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 6 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t1, 6 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: 
xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 7 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll index 789b51d9b5e5b..9528280d181a3 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll @@ -6,10 +6,10 @@ declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32) define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; CHECK-LABEL: powi_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -80 -; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 0 # 32-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill ; CHECK-NEXT: addi.w $fp, $a0, 0 ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 @@ -18,79 +18,79 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 2 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 2 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 3 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 4 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, 
%call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 4 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 5 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 5 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 6 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 6 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 7 -; CHECK-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 80 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret entry: %res = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %va, i32 %b) @@ -102,10 +102,10 @@ declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32) define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { ; CHECK-LABEL: powi_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -80 -; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 0 # 32-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill ; CHECK-NEXT: addi.w $fp, $a0, 0 ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 @@ -114,39 +114,39 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.d $a0, $fa0 ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, 
$sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.d $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.d $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 2 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.d $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 3 -; CHECK-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 80 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret entry: %res = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %va, i32 %b) diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/bitcast-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/bitcast-extract-element.ll index 86808c7a8f014..09ce1a04d6c9d 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/bitcast-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/bitcast-extract-element.ll @@ -5,8 +5,6 @@ define i32 @bitcast_extract_v8f32(<8 x float> %a) nounwind { ; CHECK-LABEL: bitcast_extract_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 -; CHECK-NEXT: movgr2fr.w $fa0, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 ; CHECK-NEXT: ret entry: %b = extractelement <8 x float> %a, i32 7 @@ -18,8 +16,6 @@ define i64 @bitcast_extract_v4f64(<4 x double> %a) nounwind { ; CHECK-LABEL: bitcast_extract_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.d $fa0, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa0 ; CHECK-NEXT: ret entry: %b = extractelement <4 x double> %a, i32 3 diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll index 04214f5dfa9d2..2e1618748688a 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll @@ -76,21 +76,21 @@ define void @extract_4xdouble(ptr %src, ptr %dst) nounwind { define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_32xi8_idx: ; CHECK: # 
%bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 0 ; CHECK-NEXT: ld.b $a0, $a0, 0 ; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <32 x i8>, ptr %src %e = extractelement <32 x i8> %v, i32 %idx @@ -101,21 +101,21 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_16xi16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 1 ; CHECK-NEXT: ld.h $a0, $a0, 0 ; CHECK-NEXT: st.h $a0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <16 x i16>, ptr %src %e = extractelement <16 x i16> %v, i32 %idx @@ -126,21 +126,21 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_8xi32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ; CHECK-NEXT: ld.w $a0, $a0, 0 ; CHECK-NEXT: st.w $a0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; 
CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <8 x i32>, ptr %src %e = extractelement <8 x i32> %v, i32 %idx @@ -151,21 +151,21 @@ define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_4xi64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ; CHECK-NEXT: ld.d $a0, $a0, 0 ; CHECK-NEXT: st.d $a0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <4 x i64>, ptr %src %e = extractelement <4 x i64> %v, i32 %idx @@ -176,21 +176,21 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_8xfloat_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ; CHECK-NEXT: fld.s $fa0, $a0, 0 ; CHECK-NEXT: fst.s $fa0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <8 x float>, ptr %src %e = extractelement <8 x float> %v, i32 %idx @@ -201,21 +201,21 @@ define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_4xdouble_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: 
st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ; CHECK-NEXT: fld.d $fa0, $a0, 0 ; CHECK-NEXT: fst.d $fa0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <4 x double>, ptr %src %e = extractelement <4 x double> %v, i32 %idx diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll index f3bec11810e9b..f154dd3b8eb3c 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll @@ -7,20 +7,12 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: shufflevector_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: movgr2fr.d $fa2, $a0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2 -; CHECK-NEXT: movgr2fr.d $fa3, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa2 ; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 0 -; CHECK-NEXT: movfr2gr.d $a0, $fa3 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2 ; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 1 ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.d $fa0, $a0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3 -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa0 ; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 2 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3 ; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 3 ; CHECK-NEXT: xvori.b $xr0, $xr2, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-bitcast-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-bitcast-element.ll index 7b2461b11f12d..b37b525981fd9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-bitcast-element.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-bitcast-element.ll @@ -4,8 +4,6 @@ define <8 x float> @insert_bitcast_v8f32(<8 x float> %a, i32 %b) nounwind { ; CHECK-LABEL: insert_bitcast_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movgr2fr.w $fa1, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 ; CHECK-NEXT: ret entry: @@ -17,8 +15,6 @@ entry: define <4 x double> @insert_bitcast_v4f64(<4 x double> %a, i64 %b) nounwind { ; CHECK-LABEL: insert_bitcast_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll index 3fdc439e68679..271e3eca31dbe 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll @@ -4,18 +4,9 @@ define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v32i8: ; CHECK: # %bb.0: # 
%entry -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.b $a0, $sp, 31 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vpickve2gr.b $a0, $vr1, 15 ; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 ; CHECK-NEXT: ret entry: %b = extractelement <32 x i8> %a, i32 31 @@ -26,18 +17,9 @@ entry: define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind { ; CHECK-LABEL: insert_extract_v16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.h $a0, $sp, 30 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vpickve2gr.h $a0, $vr1, 7 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 ; CHECK-NEXT: ret entry: %b = extractelement <16 x i16> %a, i32 15 @@ -61,8 +43,6 @@ define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind { ; CHECK-LABEL: insert_extract_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 -; CHECK-NEXT: movgr2fr.w $fa1, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 ; CHECK-NEXT: ret entry: @@ -87,8 +67,6 @@ define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind { ; CHECK-LABEL: insert_extract_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll index 88c3e4367ffa7..4e173c4feadba 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll @@ -4,23 +4,7 @@ define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v32i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.b $a1, $sp, 31 -; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1 -; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: xvextrins.b $xr0, $xr0, 31 ; CHECK-NEXT: ret entry: %b_lo = extractelement <32 x i8> %a, i32 15 @@ -33,23 +17,7 @@ entry: define <16 x i16> 
@insert_extract_v16i16(<16 x i16> %a) nounwind { ; CHECK-LABEL: insert_extract_v16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.h $a1, $sp, 30 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 -; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: xvextrins.h $xr0, $xr0, 23 ; CHECK-NEXT: ret entry: %b_lo = extractelement <16 x i16> %a, i32 7 @@ -62,10 +30,7 @@ entry: define <8 x i32> @insert_extract_v8i32(<8 x i32> %a) nounwind { ; CHECK-LABEL: insert_extract_v8i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3 -; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 7 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 5 +; CHECK-NEXT: xvextrins.w $xr0, $xr0, 19 ; CHECK-NEXT: ret entry: %b_lo = extractelement <8 x i32> %a, i32 3 @@ -78,14 +43,7 @@ entry: define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind { ; CHECK-LABEL: insert_extract_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.w $fa1, $a0 -; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 -; CHECK-NEXT: movgr2fr.w $fa2, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 -; CHECK-NEXT: movfr2gr.s $a0, $fa2 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 5 +; CHECK-NEXT: xvextrins.w $xr0, $xr0, 19 ; CHECK-NEXT: ret entry: %b_lo = extractelement <8 x float> %a, i32 3 @@ -98,10 +56,7 @@ entry: define <4 x i64> @insert_extract_v4i64(<4 x i64> %a) nounwind { ; CHECK-LABEL: insert_extract_v4i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2 +; CHECK-NEXT: xvextrins.d $xr0, $xr0, 1 ; CHECK-NEXT: ret entry: %b_lo = extractelement <4 x i64> %a, i32 1 @@ -114,14 +69,7 @@ entry: define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind { ; CHECK-LABEL: insert_extract_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.d $fa2, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0 -; CHECK-NEXT: movfr2gr.d $a0, $fa2 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 2 +; CHECK-NEXT: xvextrins.d $xr0, $xr0, 1 ; CHECK-NEXT: ret entry: %b_lo = extractelement <4 x double> %a, i32 1 diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll index 25106b456d2f7..b24f95e676d10 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll @@ -18,8 +18,7 @@ define void @insert_32xi8_upper(ptr %src, ptr %dst, i8 %in) nounwind { ; CHECK-LABEL: insert_32xi8_upper: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: 
xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a1, 0 @@ -47,8 +46,7 @@ define void @insert_16xi16_upper(ptr %src, ptr %dst, i16 %in) nounwind { ; CHECK-LABEL: insert_16xi16_upper: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a1, 0 @@ -116,22 +114,22 @@ define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind { define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_32xi8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a3, 4, 0 ; CHECK-NEXT: st.b $a2, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <32 x i8>, ptr %src %v_new = insertelement <32 x i8> %v, i8 %in, i32 %idx @@ -142,22 +140,22 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_16xi16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a3, 4, 1 ; CHECK-NEXT: st.h $a2, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <16 x i16>, ptr %src %v_new = insertelement <16 x i16> %v, i16 %in, i32 %idx @@ -168,22 +166,22 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { define void 
@insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_8xi32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a3, 4, 2 ; CHECK-NEXT: st.w $a2, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <8 x i32>, ptr %src %v_new = insertelement <8 x i32> %v, i32 %in, i32 %idx @@ -194,22 +192,22 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_4xi64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a3, 4, 3 ; CHECK-NEXT: st.d $a2, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <4 x i64>, ptr %src %v_new = insertelement <4 x i64> %v, i64 %in, i32 %idx @@ -220,22 +218,22 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_8xfloat_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: xvst $xr1, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr1, 
$sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ; CHECK-NEXT: fst.s $fa0, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <8 x float>, ptr %src %v_new = insertelement <8 x float> %v, float %in, i32 %idx @@ -246,22 +244,22 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_4xdouble_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: xvst $xr1, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr1, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ; CHECK-NEXT: fst.d $fa0, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <4 x double>, ptr %src %v_new = insertelement <4 x double> %v, double %in, i32 %idx diff --git a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll new file mode 100644 index 0000000000000..7a52531daa802 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll @@ -0,0 +1,362 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=loongarch32 -mattr=+d < %s | FileCheck -check-prefix=LA32 %s +; RUN: llc -mtriple=loongarch64 -mattr=+d < %s | FileCheck -check-prefix=LA64 %s + +define half @exp10_f16(half %x) #0 { +; LA32-LABEL: exp10_f16: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl __extendhfsf2 +; LA32-NEXT: bl exp10f +; LA32-NEXT: bl __truncsfhf2 +; LA32-NEXT: movfr2gr.s $a0, $fa0 +; LA32-NEXT: lu12i.w $a1, -16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_f16: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(__extendhfsf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfhf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; 
LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: lu12i.w $a1, -16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %r = call half @llvm.exp10.f16(half %x) + ret half %r +} + +define <2 x half> @exp10_v2f16(<2 x half> %x) #0 { +; LA32-LABEL: exp10_v2f16: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; LA32-NEXT: movgr2fr.w $fs0, $a1 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: bl __extendhfsf2 +; LA32-NEXT: bl exp10f +; LA32-NEXT: bl __truncsfhf2 +; LA32-NEXT: movfr2gr.s $fp, $fa0 +; LA32-NEXT: fmov.s $fa0, $fs0 +; LA32-NEXT: bl __extendhfsf2 +; LA32-NEXT: bl exp10f +; LA32-NEXT: bl __truncsfhf2 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_v2f16: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a0 +; LA64-NEXT: movgr2fr.w $fa0, $a1 +; LA64-NEXT: pcaddu18i $ra, %call36(__extendhfsf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfhf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: st.h $a0, $sp, 2 +; LA64-NEXT: movgr2fr.w $fa0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__extendhfsf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfhf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: st.h $a0, $sp, 0 +; LA64-NEXT: vld $vr0, $sp, 0 +; LA64-NEXT: vpickve2gr.h $a0, $vr0, 0 +; LA64-NEXT: vpickve2gr.h $a1, $vr0, 1 +; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret + %r = call <2 x half> @llvm.exp10.v2f16(<2 x half> %x) + ret <2 x half> %r +} + +define float @exp10_f32(float %x) #0 { +; LA32-LABEL: exp10_f32: +; LA32: # %bb.0: +; LA32-NEXT: b exp10f +; +; LA64-LABEL: exp10_f32: +; LA64: # %bb.0: +; LA64-NEXT: pcaddu18i $t8, %call36(exp10f) +; LA64-NEXT: jr $t8 + %r = call float @llvm.exp10.f32(float %x) + ret float %r +} + +define <2 x float> @exp10_v2f32(<2 x float> %x) #0 { +; LA32-LABEL: exp10_v2f32: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill +; LA32-NEXT: fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: fmov.s $fs0, $fa1 +; LA32-NEXT: bl exp10f +; LA32-NEXT: fmov.s $fs1, $fa0 +; LA32-NEXT: fmov.s $fa0, $fs0 +; LA32-NEXT: bl exp10f +; LA32-NEXT: fmov.s $fa1, $fa0 +; LA32-NEXT: fmov.s $fa0, $fs1 +; LA32-NEXT: fld.d $fs1, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_v2f32: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d 
$ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vpackev.w $vr0, $vr0, $vr1 +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret + %r = call <2 x float> @llvm.exp10.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define double @exp10_f64(double %x) #0 { +; LA32-LABEL: exp10_f64: +; LA32: # %bb.0: +; LA32-NEXT: b exp10 +; +; LA64-LABEL: exp10_f64: +; LA64: # %bb.0: +; LA64-NEXT: pcaddu18i $t8, %call36(exp10) +; LA64-NEXT: jr $t8 + %r = call double @llvm.exp10.f64(double %x) + ret double %r +} + +define <2 x double> @exp10_v2f64(<2 x double> %x) #0 { +; LA32-LABEL: exp10_v2f64: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill +; LA32-NEXT: fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: fmov.d $fs0, $fa1 +; LA32-NEXT: bl exp10 +; LA32-NEXT: fmov.d $fs1, $fa0 +; LA32-NEXT: fmov.d $fa0, $fs0 +; LA32-NEXT: bl exp10 +; LA32-NEXT: fmov.d $fa1, $fa0 +; LA32-NEXT: fmov.d $fa0, $fs1 +; LA32-NEXT: fld.d $fs1, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_v2f64: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.d $a0, $fa0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vreplvei.d $vr0, $vr0, 1 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.d $a0, $fa0 +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 1 +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret + %r = call <2 x double> @llvm.exp10.v2f64(<2 x double> %x) + ret <2 x double> %r +} + +define fp128 @exp10_f128(fp128 %x) #0 { +; LA32-LABEL: exp10_f128: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: ld.w $a2, $a1, 0 +; LA32-NEXT: ld.w $a3, $a1, 4 +; LA32-NEXT: ld.w $a4, $a1, 8 +; LA32-NEXT: ld.w $a1, $a1, 12 +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: st.w $a1, $sp, 20 +; LA32-NEXT: st.w $a4, $sp, 16 +; LA32-NEXT: st.w $a3, $sp, 12 +; LA32-NEXT: addi.w $a0, $sp, 24 +; LA32-NEXT: addi.w $a1, $sp, 8 +; LA32-NEXT: st.w $a2, 
$sp, 8 +; LA32-NEXT: bl exp10l +; LA32-NEXT: ld.w $a0, $sp, 36 +; LA32-NEXT: ld.w $a1, $sp, 32 +; LA32-NEXT: ld.w $a2, $sp, 28 +; LA32-NEXT: ld.w $a3, $sp, 24 +; LA32-NEXT: st.w $a0, $fp, 12 +; LA32-NEXT: st.w $a1, $fp, 8 +; LA32-NEXT: st.w $a2, $fp, 4 +; LA32-NEXT: st.w $a3, $fp, 0 +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_f128: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(exp10l) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %r = call fp128 @llvm.exp10.f128(fp128 %x) + ret fp128 %r +} + +define <2 x fp128> @exp10_v2f128(<2 x fp128> %x) #0 { +; LA32-LABEL: exp10_v2f128: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -96 +; LA32-NEXT: st.w $ra, $sp, 92 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 88 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 84 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 80 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: ld.w $s0, $a1, 16 +; LA32-NEXT: ld.w $s1, $a1, 20 +; LA32-NEXT: ld.w $s2, $a1, 24 +; LA32-NEXT: ld.w $s3, $a1, 28 +; LA32-NEXT: ld.w $a2, $a1, 0 +; LA32-NEXT: ld.w $a3, $a1, 4 +; LA32-NEXT: ld.w $a4, $a1, 8 +; LA32-NEXT: ld.w $a1, $a1, 12 +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: st.w $a1, $sp, 20 +; LA32-NEXT: st.w $a4, $sp, 16 +; LA32-NEXT: st.w $a3, $sp, 12 +; LA32-NEXT: addi.w $a0, $sp, 24 +; LA32-NEXT: addi.w $a1, $sp, 8 +; LA32-NEXT: st.w $a2, $sp, 8 +; LA32-NEXT: bl exp10l +; LA32-NEXT: st.w $s3, $sp, 52 +; LA32-NEXT: st.w $s2, $sp, 48 +; LA32-NEXT: st.w $s1, $sp, 44 +; LA32-NEXT: addi.w $a0, $sp, 56 +; LA32-NEXT: addi.w $a1, $sp, 40 +; LA32-NEXT: st.w $s0, $sp, 40 +; LA32-NEXT: bl exp10l +; LA32-NEXT: ld.w $a0, $sp, 24 +; LA32-NEXT: ld.w $a1, $sp, 28 +; LA32-NEXT: ld.w $a2, $sp, 32 +; LA32-NEXT: ld.w $a3, $sp, 36 +; LA32-NEXT: ld.w $a4, $sp, 68 +; LA32-NEXT: ld.w $a5, $sp, 64 +; LA32-NEXT: ld.w $a6, $sp, 60 +; LA32-NEXT: ld.w $a7, $sp, 56 +; LA32-NEXT: st.w $a4, $fp, 28 +; LA32-NEXT: st.w $a5, $fp, 24 +; LA32-NEXT: st.w $a6, $fp, 20 +; LA32-NEXT: st.w $a7, $fp, 16 +; LA32-NEXT: st.w $a3, $fp, 12 +; LA32-NEXT: st.w $a2, $fp, 8 +; LA32-NEXT: st.w $a1, $fp, 4 +; LA32-NEXT: st.w $a0, $fp, 0 +; LA32-NEXT: ld.w $s3, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 80 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 84 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 88 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 92 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 96 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_v2f128: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $s3, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: ld.d $fp, $a1, 16 +; LA64-NEXT: ld.d $s0, $a1, 24 +; LA64-NEXT: ld.d $a2, $a1, 0 +; LA64-NEXT: ld.d $a1, $a1, 8 +; LA64-NEXT: move $s1, $a0 +; LA64-NEXT: move $a0, $a2 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10l) +; LA64-NEXT: jirl 
$ra, $ra, 0 +; LA64-NEXT: move $s2, $a0 +; LA64-NEXT: move $s3, $a1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: move $a1, $s0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10l) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: st.d $a1, $s1, 24 +; LA64-NEXT: st.d $a0, $s1, 16 +; LA64-NEXT: st.d $s3, $s1, 8 +; LA64-NEXT: st.d $s2, $s1, 0 +; LA64-NEXT: ld.d $s3, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret + %r = call <2 x fp128> @llvm.exp10.v2f128(<2 x fp128> %x) + ret <2 x fp128> %r +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll index ffedd7f9e9438..648c19d509715 100644 --- a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll +++ b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll @@ -347,42 +347,42 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 { ; ; LA64-LABEL: test_sincos_v2f32: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill ; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: vpackev.w $vr0, $vr0, $vr1 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vpackev.w $vr1, $vr0, $vr1 -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte 
Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 ; LA64-NEXT: ret %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result @@ -439,48 +439,48 @@ define { <3 x float>, <3 x float> } @test_sincos_v3f32(<3 x float> %a) #0 { ; ; LA64-LABEL: test_sincos_v3f32: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -96 -; LA64-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -112 +; LA64-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill ; LA64-NEXT: vreplvei.w $vr0, $vr0, 2 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 72 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 88 +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 68 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 84 +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 64 -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 80 +; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 56 -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 72 +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 52 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 68 +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 48 -; LA64-NEXT: vld $vr0, $sp, 64 -; LA64-NEXT: vld $vr1, $sp, 48 -; LA64-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 96 +; LA64-NEXT: fst.s $fa0, $sp, 64 +; LA64-NEXT: vld $vr0, $sp, 80 +; LA64-NEXT: vld $vr1, $sp, 64 +; LA64-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 112 ; LA64-NEXT: ret %result = call { <3 x float>, <3 x float> } @llvm.sincos.v3f32(<3 x float> %a) ret { <3 x float>, <3 x float> } %result @@ -568,44 +568,44 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 { ; ; LA64-LABEL: test_sincos_v2f64: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, 
-64 -; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sin) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 ; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: vreplvei.d $vr0, $vr0, 1 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sin) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 1 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cos) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 ; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cos) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 ; LA64-NEXT: ret %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result @@ -801,17 +801,17 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 { ; ; LA64-LABEL: test_sincos_v2f128: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -80 -; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill -; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill -; LA64-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill -; LA64-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill -; LA64-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill -; LA64-NEXT: st.d $s5, $sp, 16 # 8-byte Folded Spill -; LA64-NEXT: st.d $s6, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: st.d $s7, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -96 +; LA64-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, 
$sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s3, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: st.d $s4, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $s5, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill ; LA64-NEXT: ld.d $fp, $a1, 16 ; LA64-NEXT: ld.d $s0, $a1, 24 ; LA64-NEXT: ld.d $s1, $a1, 0 @@ -847,17 +847,17 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 { ; LA64-NEXT: st.d $s6, $s3, 16 ; LA64-NEXT: st.d $s5, $s3, 8 ; LA64-NEXT: st.d $s4, $s3, 0 -; LA64-NEXT: ld.d $s7, $sp, 0 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s6, $sp, 8 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s5, $sp, 16 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ld.d $s7, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s6, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s5, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s4, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s3, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s2, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 96 ; LA64-NEXT: ret %result = call { <2 x fp128>, <2 x fp128> } @llvm.sincos.v2f128(<2 x fp128> %a) ret { <2 x fp128>, <2 x fp128> } %result diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/bitcast-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/bitcast-extract-element.ll index df4896d7ec936..9a40feb45671f 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/bitcast-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/bitcast-extract-element.ll @@ -4,8 +4,7 @@ define i32 @bitcast_extract_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: bitcast_extract_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 +; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3 ; CHECK-NEXT: ret entry: %b = extractelement <4 x float> %a, i32 3 @@ -16,8 +15,7 @@ entry: define i64 @bitcast_extract_v2f64(<2 x double> %a) nounwind { ; CHECK-LABEL: bitcast_extract_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1 -; CHECK-NEXT: movfr2gr.d $a0, $fa0 +; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 ; CHECK-NEXT: ret entry: %b = extractelement <2 x double> %a, i32 1 diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-bitcast-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-bitcast-element.ll index a20d17efdfb11..c42e3013c1131 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-bitcast-element.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-bitcast-element.ll @@ -4,8 +4,6 @@ define <4 x float> @insert_bitcast_v4f32(<4 x float> %a, i32 %b) nounwind { ; CHECK-LABEL: insert_bitcast_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: 
movgr2fr.w $fa1, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 ; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 1 ; CHECK-NEXT: ret entry: @@ -17,8 +15,6 @@ entry: define <2 x double> @insert_bitcast_v2f64(<2 x double> %a, i64 %b) nounwind { ; CHECK-LABEL: insert_bitcast_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 ; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 1 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll index c7dd1454c7e33..e9a0c8a110452 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll @@ -4,8 +4,7 @@ define <16 x i8> @insert_extract_v16i8(<16 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v16i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15 -; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1 +; CHECK-NEXT: vextrins.b $vr0, $vr0, 31 ; CHECK-NEXT: ret entry: %b = extractelement <16 x i8> %a, i32 15 @@ -16,8 +15,7 @@ entry: define <8 x i16> @insert_extract_v8i16(<8 x i16> %a) nounwind { ; CHECK-LABEL: insert_extract_v8i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 23 ; CHECK-NEXT: ret entry: %b = extractelement <8 x i16> %a, i32 7 @@ -28,8 +26,7 @@ entry: define <4 x i32> @insert_extract_v4i32(<4 x i32> %a) nounwind { ; CHECK-LABEL: insert_extract_v4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; CHECK-NEXT: vextrins.w $vr0, $vr0, 19 ; CHECK-NEXT: ret entry: %b = extractelement <4 x i32> %a, i32 3 @@ -40,9 +37,7 @@ entry: define <4 x float> @insert_extract_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: insert_extract_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 3 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; CHECK-NEXT: vextrins.w $vr0, $vr0, 3 ; CHECK-NEXT: ret entry: %b = extractelement <4 x float> %a, i32 3 @@ -53,8 +48,7 @@ entry: define <2 x i64> @insert_extract_v2i64(<2 x i64> %a) nounwind { ; CHECK-LABEL: insert_extract_v2i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; CHECK-NEXT: vextrins.d $vr0, $vr0, 1 ; CHECK-NEXT: ret entry: %b = extractelement <2 x i64> %a, i32 1 @@ -65,9 +59,7 @@ entry: define <2 x double> @insert_extract_v2f64(<2 x double> %a) nounwind { ; CHECK-LABEL: insert_extract_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; CHECK-NEXT: vextrins.d $vr0, $vr0, 1 ; CHECK-NEXT: ret entry: %b = extractelement <2 x double> %a, i32 1 diff --git a/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll b/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll new file mode 100644 index 0000000000000..96159e5884d3f --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 -mattr=+d,+lsx --verify-machineinstrs < %s | FileCheck %s +define void @eliminate_frame_index(<16 x i8> %a) nounwind { +; CHECK-LABEL: eliminate_frame_index: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -240 +; CHECK-NEXT: st.d $ra, $sp, 232 # 8-byte Folded Spill +; 
CHECK-NEXT: st.d $fp, $sp, 224 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s0, $sp, 216 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s1, $sp, 208 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s2, $sp, 200 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s3, $sp, 192 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s4, $sp, 184 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s5, $sp, 176 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s6, $sp, 168 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s7, $sp, 160 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s8, $sp, 152 # 8-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $zero, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $ra, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $tp, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a0, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a1, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a2, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a3, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a4, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a5, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a6, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a7, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t0, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t1, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t2, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t3, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t4, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t5, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t6, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t7, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t8, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $fp, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s0, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s1, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s2, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s3, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s4, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s5, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s6, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s7, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s8, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: st.d $a0, $sp, 0 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $a0, $sp, 136 +; CHECK-NEXT: vstelm.b $vr0, $a0, 0, 0 +; CHECK-NEXT: ld.d $a0, $sp, 0 # 8-byte Folded Reload +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $zero +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $ra +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $tp +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a2 +; CHECK-NEXT: #NO_APP +; 
CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a3 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a5 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a6 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a7 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t3 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t5 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t6 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t7 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t8 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $fp +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s3 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s5 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s6 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s7 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s8 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ld.d $s8, $sp, 152 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s7, $sp, 160 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s6, $sp, 168 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s5, $sp, 176 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s4, $sp, 184 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s3, $sp, 192 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s2, $sp, 200 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s1, $sp, 208 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s0, $sp, 216 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $fp, $sp, 224 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 232 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 240 +; CHECK-NEXT: ret + %s = alloca [16 x i8] + %ss = alloca [128 x i8] + + %zero = call i64 asm sideeffect "addi.d $$zero, $$zero, 1", "={r0}"() + %ra = call i64 asm sideeffect "addi.d $$ra, $$zero, 1", "={r1}"() + %tp = call i64 asm sideeffect "addi.d $$tp, $$zero, 1", "={r2}"() + %a0 = call i64 asm sideeffect "addi.d $$a0, $$zero, 1", "={r4}"() + %a1 = call i64 asm sideeffect "addi.d $$a1, $$zero, 1", "={r5}"() + %a2 = call i64 asm sideeffect "addi.d $$a2, $$zero, 1", "={r6}"() + %a3 = call i64 asm sideeffect "addi.d $$a3, $$zero, 1", "={r7}"() + %a4 = call i64 asm sideeffect "addi.d $$a4, $$zero, 1", "={r8}"() + %a5 = call i64 asm sideeffect "addi.d $$a5, $$zero, 1", "={r9}"() + %a6 = call i64 asm sideeffect "addi.d $$a6, $$zero, 1", "={r10}"() + %a7 = call i64 asm sideeffect "addi.d $$a7, $$zero, 1", "={r11}"() + %t0 = call i64 asm sideeffect "addi.d $$t0, $$zero, 1", "={r12}"() + %t1 = call i64 asm sideeffect "addi.d $$t1, $$zero, 1", "={r13}"() + %t2 = call i64 asm sideeffect "addi.d $$t2, $$zero, 1", "={r14}"() + %t3 = call i64 asm sideeffect "addi.d $$t3, $$zero, 1", "={r15}"() + %t4 = call i64 asm sideeffect "addi.d $$t4, $$zero, 1", 
"={r16}"() + %t5 = call i64 asm sideeffect "addi.d $$t5, $$zero, 1", "={r17}"() + %t6 = call i64 asm sideeffect "addi.d $$t6, $$zero, 1", "={r18}"() + %t7 = call i64 asm sideeffect "addi.d $$t7, $$zero, 1", "={r19}"() + %t8 = call i64 asm sideeffect "addi.d $$t8, $$zero, 1", "={r20}"() + ;; r21 Reserved (Non-allocatable) + %s9 = call i64 asm sideeffect "addi.d $$s9, $$zero, 1", "={r22}"() + %s0 = call i64 asm sideeffect "addi.d $$s0, $$zero, 1", "={r23}"() + %s1 = call i64 asm sideeffect "addi.d $$s1, $$zero, 1", "={r24}"() + %s2 = call i64 asm sideeffect "addi.d $$s2, $$zero, 1", "={r25}"() + %s3 = call i64 asm sideeffect "addi.d $$s3, $$zero, 1", "={r26}"() + %s4 = call i64 asm sideeffect "addi.d $$s4, $$zero, 1", "={r27}"() + %s5 = call i64 asm sideeffect "addi.d $$s5, $$zero, 1", "={r28}"() + %s6 = call i64 asm sideeffect "addi.d $$s6, $$zero, 1", "={r29}"() + %s7 = call i64 asm sideeffect "addi.d $$s7, $$zero, 1", "={r30}"() + %s8 = call i64 asm sideeffect "addi.d $$s8, $$zero, 1", "={r31}"() + + %e = extractelement <16 x i8> %a, i64 0 + + store volatile i8 %e, ptr %s + + call void asm sideeffect "# reg use $0", "{r0}"(i64 %zero) + call void asm sideeffect "# reg use $0", "{r1}"(i64 %ra) + call void asm sideeffect "# reg use $0", "{r2}"(i64 %tp) + call void asm sideeffect "# reg use $0", "{r4}"(i64 %a0) + call void asm sideeffect "# reg use $0", "{r5}"(i64 %a1) + call void asm sideeffect "# reg use $0", "{r6}"(i64 %a2) + call void asm sideeffect "# reg use $0", "{r7}"(i64 %a3) + call void asm sideeffect "# reg use $0", "{r8}"(i64 %a4) + call void asm sideeffect "# reg use $0", "{r9}"(i64 %a5) + call void asm sideeffect "# reg use $0", "{r10}"(i64 %a6) + call void asm sideeffect "# reg use $0", "{r11}"(i64 %a7) + call void asm sideeffect "# reg use $0", "{r12}"(i64 %t0) + call void asm sideeffect "# reg use $0", "{r13}"(i64 %t1) + call void asm sideeffect "# reg use $0", "{r14}"(i64 %t2) + call void asm sideeffect "# reg use $0", "{r15}"(i64 %t3) + call void asm sideeffect "# reg use $0", "{r16}"(i64 %t4) + call void asm sideeffect "# reg use $0", "{r17}"(i64 %t5) + call void asm sideeffect "# reg use $0", "{r18}"(i64 %t6) + call void asm sideeffect "# reg use $0", "{r19}"(i64 %t7) + call void asm sideeffect "# reg use $0", "{r20}"(i64 %t8) + ;; r21 Reserved (Non-allocatable) + call void asm sideeffect "# reg use $0", "{r22}"(i64 %s9) + call void asm sideeffect "# reg use $0", "{r23}"(i64 %s0) + call void asm sideeffect "# reg use $0", "{r24}"(i64 %s1) + call void asm sideeffect "# reg use $0", "{r25}"(i64 %s2) + call void asm sideeffect "# reg use $0", "{r26}"(i64 %s3) + call void asm sideeffect "# reg use $0", "{r27}"(i64 %s4) + call void asm sideeffect "# reg use $0", "{r28}"(i64 %s5) + call void asm sideeffect "# reg use $0", "{r29}"(i64 %s6) + call void asm sideeffect "# reg use $0", "{r30}"(i64 %s7) + call void asm sideeffect "# reg use $0", "{r31}"(i64 %s8) + + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/stack-protector-target.ll b/llvm/test/CodeGen/LoongArch/stack-protector-target.ll new file mode 100644 index 0000000000000..ea4569e198d02 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/stack-protector-target.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=loongarch32-linux-gnu < %s | FileCheck -check-prefix=LINUX32 %s +; RUN: llc -mtriple=loongarch64-linux-gnu < %s | FileCheck -check-prefix=LINUX64 %s +; RUN: llc -mtriple=loongarch32-unknown-openbsd < %s | FileCheck 
-check-prefix=OPENBSD32 %s +; RUN: llc -mtriple=loongarch64-unknown-openbsd < %s | FileCheck -check-prefix=OPENBSD64 %s + +define void @func() sspreq nounwind { +; LINUX32-LABEL: func: +; LINUX32: # %bb.0: +; LINUX32-NEXT: addi.w $sp, $sp, -16 +; LINUX32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LINUX32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LINUX32-NEXT: pcalau12i $fp, %pc_hi20(__stack_chk_guard) +; LINUX32-NEXT: ld.w $a0, $fp, %pc_lo12(__stack_chk_guard) +; LINUX32-NEXT: st.w $a0, $sp, 4 +; LINUX32-NEXT: addi.w $a0, $sp, 0 +; LINUX32-NEXT: bl capture +; LINUX32-NEXT: ld.w $a0, $fp, %pc_lo12(__stack_chk_guard) +; LINUX32-NEXT: ld.w $a1, $sp, 4 +; LINUX32-NEXT: bne $a0, $a1, .LBB0_2 +; LINUX32-NEXT: # %bb.1: +; LINUX32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LINUX32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LINUX32-NEXT: addi.w $sp, $sp, 16 +; LINUX32-NEXT: ret +; LINUX32-NEXT: .LBB0_2: +; LINUX32-NEXT: bl __stack_chk_fail +; +; LINUX64-LABEL: func: +; LINUX64: # %bb.0: +; LINUX64-NEXT: addi.d $sp, $sp, -32 +; LINUX64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LINUX64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; LINUX64-NEXT: pcalau12i $fp, %pc_hi20(__stack_chk_guard) +; LINUX64-NEXT: ld.d $a0, $fp, %pc_lo12(__stack_chk_guard) +; LINUX64-NEXT: st.d $a0, $sp, 8 +; LINUX64-NEXT: addi.d $a0, $sp, 4 +; LINUX64-NEXT: pcaddu18i $ra, %call36(capture) +; LINUX64-NEXT: jirl $ra, $ra, 0 +; LINUX64-NEXT: ld.d $a0, $fp, %pc_lo12(__stack_chk_guard) +; LINUX64-NEXT: ld.d $a1, $sp, 8 +; LINUX64-NEXT: bne $a0, $a1, .LBB0_2 +; LINUX64-NEXT: # %bb.1: +; LINUX64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LINUX64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LINUX64-NEXT: addi.d $sp, $sp, 32 +; LINUX64-NEXT: ret +; LINUX64-NEXT: .LBB0_2: +; LINUX64-NEXT: pcaddu18i $ra, %call36(__stack_chk_fail) +; LINUX64-NEXT: jirl $ra, $ra, 0 +; +; OPENBSD32-LABEL: func: +; OPENBSD32: # %bb.0: +; OPENBSD32-NEXT: addi.w $sp, $sp, -16 +; OPENBSD32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; OPENBSD32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; OPENBSD32-NEXT: pcalau12i $fp, %pc_hi20(__guard_local) +; OPENBSD32-NEXT: ld.w $a0, $fp, %pc_lo12(__guard_local) +; OPENBSD32-NEXT: st.w $a0, $sp, 4 +; OPENBSD32-NEXT: addi.w $a0, $sp, 0 +; OPENBSD32-NEXT: bl capture +; OPENBSD32-NEXT: ld.w $a0, $fp, %pc_lo12(__guard_local) +; OPENBSD32-NEXT: ld.w $a1, $sp, 4 +; OPENBSD32-NEXT: bne $a0, $a1, .LBB0_2 +; OPENBSD32-NEXT: # %bb.1: # %SP_return +; OPENBSD32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; OPENBSD32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; OPENBSD32-NEXT: addi.w $sp, $sp, 16 +; OPENBSD32-NEXT: ret +; OPENBSD32-NEXT: .LBB0_2: # %CallStackCheckFailBlk +; OPENBSD32-NEXT: pcalau12i $a0, %pc_hi20(.LSSH) +; OPENBSD32-NEXT: addi.w $a0, $a0, %pc_lo12(.LSSH) +; OPENBSD32-NEXT: bl __stack_smash_handler +; +; OPENBSD64-LABEL: func: +; OPENBSD64: # %bb.0: +; OPENBSD64-NEXT: addi.d $sp, $sp, -32 +; OPENBSD64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; OPENBSD64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; OPENBSD64-NEXT: pcalau12i $fp, %pc_hi20(__guard_local) +; OPENBSD64-NEXT: ld.d $a0, $fp, %pc_lo12(__guard_local) +; OPENBSD64-NEXT: st.d $a0, $sp, 8 +; OPENBSD64-NEXT: addi.d $a0, $sp, 4 +; OPENBSD64-NEXT: pcaddu18i $ra, %call36(capture) +; OPENBSD64-NEXT: jirl $ra, $ra, 0 +; OPENBSD64-NEXT: ld.d $a0, $fp, %pc_lo12(__guard_local) +; OPENBSD64-NEXT: ld.d $a1, $sp, 8 +; OPENBSD64-NEXT: bne $a0, $a1, .LBB0_2 +; OPENBSD64-NEXT: # %bb.1: # %SP_return +; 
OPENBSD64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; OPENBSD64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; OPENBSD64-NEXT: addi.d $sp, $sp, 32 +; OPENBSD64-NEXT: ret +; OPENBSD64-NEXT: .LBB0_2: # %CallStackCheckFailBlk +; OPENBSD64-NEXT: pcalau12i $a0, %pc_hi20(.LSSH) +; OPENBSD64-NEXT: addi.d $a0, $a0, %pc_lo12(.LSSH) +; OPENBSD64-NEXT: pcaddu18i $ra, %call36(__stack_smash_handler) +; OPENBSD64-NEXT: jirl $ra, $ra, 0 + %alloca = alloca i32, align 4 + call void @capture(ptr %alloca) + ret void +} + +declare void @capture(ptr) diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll index 9f15604fcca6b..69995a0721f8a 100644 --- a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll +++ b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll @@ -36,15 +36,15 @@ define void @caller(i32 %n) { ; ; LA64-LABEL: caller: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: .cfi_def_cfa_offset 64 -; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; LA64-NEXT: st.d $s8, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -128 +; LA64-NEXT: .cfi_def_cfa_offset 128 +; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-NEXT: st.d $s8, $sp, 104 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 ; LA64-NEXT: .cfi_offset 31, -24 -; LA64-NEXT: addi.d $fp, $sp, 64 +; LA64-NEXT: addi.d $fp, $sp, 128 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 5, 0 ; LA64-NEXT: move $s8, $sp @@ -54,14 +54,14 @@ define void @caller(i32 %n) { ; LA64-NEXT: slli.d $a0, $a0, 4 ; LA64-NEXT: sub.d $a0, $sp, $a0 ; LA64-NEXT: move $sp, $a0 -; LA64-NEXT: addi.d $a1, $s8, 0 +; LA64-NEXT: addi.d $a1, $s8, 64 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -64 -; LA64-NEXT: ld.d $s8, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: addi.d $sp, $fp, -128 +; LA64-NEXT: ld.d $s8, $sp, 104 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 128 ; LA64-NEXT: ret %1 = alloca i8, i32 %n %2 = alloca i32, align 64 diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll index 0645339358b64..0188884543adb 100644 --- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll +++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll @@ -28,22 +28,22 @@ define void @caller32() { ; ; LA64-LABEL: caller32: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -32 -; LA64-NEXT: .cfi_def_cfa_offset 32 -; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -64 +; LA64-NEXT: .cfi_def_cfa_offset 64 +; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: addi.d $fp, $sp, 32 +; LA64-NEXT: addi.d $fp, $sp, 64 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 4, 0 -; LA64-NEXT: addi.d $a0, $sp, 0 +; LA64-NEXT: addi.d $a0, $sp, 
32 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -32 -; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: addi.d $sp, $fp, -64 +; LA64-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 64 ; LA64-NEXT: ret %1 = alloca i8, align 32 call void @callee(ptr %1) @@ -102,22 +102,22 @@ define void @caller64() { ; ; LA64-LABEL: caller64: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: .cfi_def_cfa_offset 64 -; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -128 +; LA64-NEXT: .cfi_def_cfa_offset 128 +; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: addi.d $fp, $sp, 64 +; LA64-NEXT: addi.d $fp, $sp, 128 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 5, 0 -; LA64-NEXT: addi.d $a0, $sp, 0 +; LA64-NEXT: addi.d $a0, $sp, 64 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -64 -; LA64-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: addi.d $sp, $fp, -128 +; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 128 ; LA64-NEXT: ret %1 = alloca i8, align 64 call void @callee(ptr %1) @@ -176,22 +176,22 @@ define void @caller128() { ; ; LA64-LABEL: caller128: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -128 -; LA64-NEXT: .cfi_def_cfa_offset 128 -; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -256 +; LA64-NEXT: .cfi_def_cfa_offset 256 +; LA64-NEXT: st.d $ra, $sp, 248 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 240 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: addi.d $fp, $sp, 128 +; LA64-NEXT: addi.d $fp, $sp, 256 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 6, 0 -; LA64-NEXT: addi.d $a0, $sp, 0 +; LA64-NEXT: addi.d $a0, $sp, 128 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -128 -; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 128 +; LA64-NEXT: addi.d $sp, $fp, -256 +; LA64-NEXT: ld.d $fp, $sp, 240 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 248 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 256 ; LA64-NEXT: ret %1 = alloca i8, align 128 call void @callee(ptr %1) @@ -250,22 +250,22 @@ define void @caller256() { ; ; LA64-LABEL: caller256: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -256 -; LA64-NEXT: .cfi_def_cfa_offset 256 -; LA64-NEXT: st.d $ra, $sp, 248 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 240 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -512 +; LA64-NEXT: .cfi_def_cfa_offset 512 +; LA64-NEXT: st.d $ra, $sp, 504 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 496 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: addi.d $fp, $sp, 256 +; LA64-NEXT: 
addi.d $fp, $sp, 512 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 7, 0 -; LA64-NEXT: addi.d $a0, $sp, 0 +; LA64-NEXT: addi.d $a0, $sp, 256 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -256 -; LA64-NEXT: ld.d $fp, $sp, 240 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 248 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 256 +; LA64-NEXT: addi.d $sp, $fp, -512 +; LA64-NEXT: ld.d $fp, $sp, 496 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 504 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 512 ; LA64-NEXT: ret %1 = alloca i8, align 256 call void @callee(ptr %1) diff --git a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll index 925fdf3d60646..0d441e66a0c84 100644 --- a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll +++ b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll @@ -121,19 +121,19 @@ define void @t3() { ; ; LA64-LABEL: t3: ; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: .cfi_def_cfa_offset 64 +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: .cfi_def_cfa_offset 80 ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.L.str) ; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.L.str) ; LA64-NEXT: ld.h $a1, $a0, 20 ; LA64-NEXT: ld.w $a2, $a0, 16 ; LA64-NEXT: ld.d $a3, $a0, 8 ; LA64-NEXT: ld.d $a0, $a0, 0 -; LA64-NEXT: st.h $a1, $sp, 20 -; LA64-NEXT: st.w $a2, $sp, 16 -; LA64-NEXT: st.d $a3, $sp, 8 -; LA64-NEXT: st.d $a0, $sp, 0 -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: st.h $a1, $sp, 36 +; LA64-NEXT: st.w $a2, $sp, 32 +; LA64-NEXT: st.d $a3, $sp, 24 +; LA64-NEXT: st.d $a0, $sp, 16 +; LA64-NEXT: addi.d $sp, $sp, 80 ; LA64-NEXT: ret entry: %msgbuf = alloca [64 x i8], align 1 diff --git a/llvm/test/CodeGen/LoongArch/vararg.ll b/llvm/test/CodeGen/LoongArch/vararg.ll index 939cd2015c5b1..bc4b8a77c7e15 100644 --- a/llvm/test/CodeGen/LoongArch/vararg.ll +++ b/llvm/test/CodeGen/LoongArch/vararg.ll @@ -47,7 +47,7 @@ define i64 @va1(ptr %fmt, ...) { ; LA64-WITHFP-NEXT: st.d $a2, $fp, 16 ; LA64-WITHFP-NEXT: st.d $a1, $fp, 8 ; LA64-WITHFP-NEXT: addi.d $a1, $fp, 16 -; LA64-WITHFP-NEXT: st.d $a1, $fp, -24 +; LA64-WITHFP-NEXT: st.d $a1, $fp, -32 ; LA64-WITHFP-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-WITHFP-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload ; LA64-WITHFP-NEXT: addi.d $sp, $sp, 96 @@ -94,7 +94,7 @@ define i64 @va1_va_arg(ptr %fmt, ...) nounwind { ; LA64-WITHFP-NEXT: st.d $a2, $fp, 16 ; LA64-WITHFP-NEXT: st.d $a1, $fp, 8 ; LA64-WITHFP-NEXT: addi.d $a1, $fp, 16 -; LA64-WITHFP-NEXT: st.d $a1, $fp, -24 +; LA64-WITHFP-NEXT: st.d $a1, $fp, -32 ; LA64-WITHFP-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-WITHFP-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload ; LA64-WITHFP-NEXT: addi.d $sp, $sp, 96 @@ -112,11 +112,11 @@ define i64 @va1_va_arg(ptr %fmt, ...) nounwind { define i64 @va1_va_arg_alloca(ptr %fmt, ...) 
nounwind { ; LA64-FPELIM-LABEL: va1_va_arg_alloca: ; LA64-FPELIM: # %bb.0: -; LA64-FPELIM-NEXT: addi.d $sp, $sp, -96 -; LA64-FPELIM-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; LA64-FPELIM-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill -; LA64-FPELIM-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill -; LA64-FPELIM-NEXT: addi.d $fp, $sp, 32 +; LA64-FPELIM-NEXT: addi.d $sp, $sp, -112 +; LA64-FPELIM-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-FPELIM-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-FPELIM-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-FPELIM-NEXT: addi.d $fp, $sp, 48 ; LA64-FPELIM-NEXT: move $s0, $a1 ; LA64-FPELIM-NEXT: st.d $a7, $fp, 56 ; LA64-FPELIM-NEXT: st.d $a6, $fp, 48 @@ -126,7 +126,7 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind { ; LA64-FPELIM-NEXT: st.d $a2, $fp, 16 ; LA64-FPELIM-NEXT: st.d $a1, $fp, 8 ; LA64-FPELIM-NEXT: addi.d $a0, $fp, 16 -; LA64-FPELIM-NEXT: st.d $a0, $fp, -32 +; LA64-FPELIM-NEXT: st.d $a0, $fp, -40 ; LA64-FPELIM-NEXT: addi.d $a0, $a1, 15 ; LA64-FPELIM-NEXT: bstrins.d $a0, $zero, 3, 0 ; LA64-FPELIM-NEXT: sub.d $a0, $sp, $a0 @@ -134,20 +134,20 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind { ; LA64-FPELIM-NEXT: pcaddu18i $ra, %call36(notdead) ; LA64-FPELIM-NEXT: jirl $ra, $ra, 0 ; LA64-FPELIM-NEXT: move $a0, $s0 -; LA64-FPELIM-NEXT: addi.d $sp, $fp, -32 -; LA64-FPELIM-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload -; LA64-FPELIM-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload -; LA64-FPELIM-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; LA64-FPELIM-NEXT: addi.d $sp, $sp, 96 +; LA64-FPELIM-NEXT: addi.d $sp, $fp, -48 +; LA64-FPELIM-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-FPELIM-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-FPELIM-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-FPELIM-NEXT: addi.d $sp, $sp, 112 ; LA64-FPELIM-NEXT: ret ; ; LA64-WITHFP-LABEL: va1_va_arg_alloca: ; LA64-WITHFP: # %bb.0: -; LA64-WITHFP-NEXT: addi.d $sp, $sp, -96 -; LA64-WITHFP-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: addi.d $fp, $sp, 32 +; LA64-WITHFP-NEXT: addi.d $sp, $sp, -112 +; LA64-WITHFP-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: addi.d $fp, $sp, 48 ; LA64-WITHFP-NEXT: move $s0, $a1 ; LA64-WITHFP-NEXT: st.d $a7, $fp, 56 ; LA64-WITHFP-NEXT: st.d $a6, $fp, 48 @@ -157,7 +157,7 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind { ; LA64-WITHFP-NEXT: st.d $a2, $fp, 16 ; LA64-WITHFP-NEXT: st.d $a1, $fp, 8 ; LA64-WITHFP-NEXT: addi.d $a0, $fp, 16 -; LA64-WITHFP-NEXT: st.d $a0, $fp, -32 +; LA64-WITHFP-NEXT: st.d $a0, $fp, -40 ; LA64-WITHFP-NEXT: addi.d $a0, $a1, 15 ; LA64-WITHFP-NEXT: bstrins.d $a0, $zero, 3, 0 ; LA64-WITHFP-NEXT: sub.d $a0, $sp, $a0 @@ -165,11 +165,11 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) 
nounwind { ; LA64-WITHFP-NEXT: pcaddu18i $ra, %call36(notdead) ; LA64-WITHFP-NEXT: jirl $ra, $ra, 0 ; LA64-WITHFP-NEXT: move $a0, $s0 -; LA64-WITHFP-NEXT: addi.d $sp, $fp, -32 -; LA64-WITHFP-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: addi.d $sp, $sp, 96 +; LA64-WITHFP-NEXT: addi.d $sp, $fp, -48 +; LA64-WITHFP-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: addi.d $sp, $sp, 112 ; LA64-WITHFP-NEXT: ret %va = alloca ptr, align 8 call void @llvm.va_start(ptr %va) @@ -314,10 +314,10 @@ define void @va_aligned_stack_caller() nounwind { ; ; LA64-WITHFP-LABEL: va_aligned_stack_caller: ; LA64-WITHFP: # %bb.0: -; LA64-WITHFP-NEXT: addi.d $sp, $sp, -112 -; LA64-WITHFP-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: addi.d $fp, $sp, 112 +; LA64-WITHFP-NEXT: addi.d $sp, $sp, -128 +; LA64-WITHFP-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: addi.d $fp, $sp, 128 ; LA64-WITHFP-NEXT: ori $a0, $zero, 17 ; LA64-WITHFP-NEXT: st.d $a0, $sp, 48 ; LA64-WITHFP-NEXT: ori $a0, $zero, 16 @@ -336,23 +336,23 @@ define void @va_aligned_stack_caller() nounwind { ; LA64-WITHFP-NEXT: lu32i.d $a0, 335544 ; LA64-WITHFP-NEXT: lu52i.d $a0, $a0, -328 ; LA64-WITHFP-NEXT: st.d $a0, $sp, 16 -; LA64-WITHFP-NEXT: st.d $zero, $fp, -24 +; LA64-WITHFP-NEXT: st.d $zero, $fp, -40 ; LA64-WITHFP-NEXT: vrepli.b $vr0, 0 -; LA64-WITHFP-NEXT: vst $vr0, $fp, -40 +; LA64-WITHFP-NEXT: vst $vr0, $fp, -56 ; LA64-WITHFP-NEXT: ori $a5, $zero, 1000 ; LA64-WITHFP-NEXT: ori $a0, $zero, 1 ; LA64-WITHFP-NEXT: ori $a1, $zero, 11 -; LA64-WITHFP-NEXT: addi.d $a2, $fp, -48 +; LA64-WITHFP-NEXT: addi.d $a2, $fp, -64 ; LA64-WITHFP-NEXT: ori $a3, $zero, 12 ; LA64-WITHFP-NEXT: ori $a4, $zero, 13 ; LA64-WITHFP-NEXT: ori $a7, $zero, 1 -; LA64-WITHFP-NEXT: st.d $a5, $fp, -48 +; LA64-WITHFP-NEXT: st.d $a5, $fp, -64 ; LA64-WITHFP-NEXT: move $a6, $zero ; LA64-WITHFP-NEXT: pcaddu18i $ra, %call36(va_aligned_stack_callee) ; LA64-WITHFP-NEXT: jirl $ra, $ra, 0 -; LA64-WITHFP-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: addi.d $sp, $sp, 112 +; LA64-WITHFP-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: addi.d $sp, $sp, 128 ; LA64-WITHFP-NEXT: ret %1 = call i32 (i32, ...) 
@va_aligned_stack_callee(i32 1, i32 11, i256 1000, i32 12, i32 13, i128 18446744073709551616, i32 14, diff --git a/llvm/test/CodeGen/MIR/AMDGPU/target-flags.mir b/llvm/test/CodeGen/MIR/AMDGPU/target-flags.mir index db9eed3b223cf..0de6114cd7d14 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/target-flags.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/target-flags.mir @@ -22,13 +22,23 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: flags - ; CHECK: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 12, implicit-def dead $scc - ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo - ; CHECK: S_ENDPGM 0 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 12, implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @foo + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @foo + ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 target-flags(amdgpu-abs64) @foo + ; CHECK-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 target-flags(amdgpu-rel64) @foo + ; CHECK-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 target-flags(amdgpu-gotprel64) @foo + ; CHECK-NEXT: S_ENDPGM 0 %0 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 12, implicit-def dead $scc %1 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo %2:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @foo %3:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @foo + %4:sreg_64 = S_MOV_B64 target-flags(amdgpu-abs64) @foo + %5:sreg_64 = S_MOV_B64 target-flags(amdgpu-rel64) @foo + %6:sreg_64 = S_MOV_B64 target-flags(amdgpu-gotprel64) @foo S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/Mips/fastcc.ll b/llvm/test/CodeGen/Mips/fastcc.ll index e6d12664d83f4..4dbadbb24ea86 100644 --- a/llvm/test/CodeGen/Mips/fastcc.ll +++ b/llvm/test/CodeGen/Mips/fastcc.ll @@ -1,6 +1,4 @@ ; RUN: llc < %s -mtriple=mipsel -relocation-model=pic | FileCheck %s -; RUN: llc < %s -mtriple=mipsel-none-nacl-gnu -relocation-model=pic -mips-tail-calls=1\ -; RUN: | FileCheck %s -check-prefix=CHECK-NACL ; RUN: llc < %s -mtriple=mipsel -mcpu=mips32 -mattr=+nooddspreg -relocation-model=pic -mips-tail-calls=1| FileCheck %s -check-prefix=NOODDSPREG ; RUN: llc < %s -mtriple=mipsel -mcpu=mips32r2 -mattr=+fp64,+nooddspreg -relocation-model=pic -mips-tail-calls=1 | FileCheck %s -check-prefix=FP64-NOODDSPREG @@ -103,11 +101,6 @@ entry: ; CHECK: lw $5 ; CHECK: lw $4 -; t6, t7 and t8 are reserved in NaCl and cannot be used for fastcc. -; CHECK-NACL-NOT: lw $14 -; CHECK-NACL-NOT: lw $15 -; CHECK-NACL-NOT: lw $24 - %0 = load i32, ptr @gi0, align 4 %1 = load i32, ptr @gi1, align 4 %2 = load i32, ptr @gi2, align 4 @@ -146,11 +139,6 @@ entry: ; CHECK-DAG: sw $24 ; CHECK-DAG: sw $3 -; t6, t7 and t8 are reserved in NaCl and cannot be used for fastcc. 
-; CHECK-NACL-NOT: sw $14 -; CHECK-NACL-NOT: sw $15 -; CHECK-NACL-NOT: sw $24 - store i32 %a0, ptr @g0, align 4 store i32 %a1, ptr @g1, align 4 store i32 %a2, ptr @g2, align 4 diff --git a/llvm/test/CodeGen/Mips/fp-indexed-ls.ll b/llvm/test/CodeGen/Mips/fp-indexed-ls.ll index 8e20c8229847c..d1ec8fcbf3720 100644 --- a/llvm/test/CodeGen/Mips/fp-indexed-ls.ll +++ b/llvm/test/CodeGen/Mips/fp-indexed-ls.ll @@ -6,9 +6,6 @@ ; RUN: llc -mtriple=mips64el -mcpu=mips64r2 -target-abi=n64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS4 ; RUN: llc -mtriple=mips64el -mcpu=mips64r6 -target-abi=n64 -relocation-model=pic < %s | FileCheck %s -check-prefixes=ALL,MIPS64R6 -; Check that [ls][dwu]xc1 are not emitted for nacl. -; RUN: llc -mtriple=mipsel-none-nacl-gnu -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=CHECK-NACL - %struct.S = type <{ [4 x float] }> %struct.S2 = type <{ [4 x double] }> %struct.S3 = type <{ i8, float }> @@ -43,8 +40,6 @@ entry: ; MIPS64R6: daddu $[[T3:[0-9]+]], $4, $[[T1]] ; MIPS64R6: lwc1 $f0, 0($[[T3]]) -; CHECK-NACL-NOT: lwxc1 - %arrayidx = getelementptr inbounds float, ptr %b, i32 %o %0 = load float, ptr %arrayidx, align 4 ret float %0 @@ -74,8 +69,6 @@ entry: ; MIPS64R6: daddu $[[T3:[0-9]+]], $4, $[[T1]] ; MIPS64R6: ldc1 $f0, 0($[[T3]]) -; CHECK-NACL-NOT: ldxc1 - %arrayidx = getelementptr inbounds double, ptr %b, i32 %o %0 = load double, ptr %arrayidx, align 8 ret double %0 @@ -127,8 +120,6 @@ entry: ; MIPS64R6-DAG: daddu $[[T1:[0-9]+]], $4, ${{[0-9]+}} ; MIPS64R6-DAG: swc1 $[[T0]], 0($[[T1]]) -; CHECK-NACL-NOT: swxc1 - %0 = load float, ptr @gf, align 4 %arrayidx = getelementptr inbounds float, ptr %b, i32 %o store float %0, ptr %arrayidx, align 4 @@ -157,8 +148,6 @@ entry: ; MIPS64R6-DAG: daddu $[[T1:[0-9]+]], $4, ${{[0-9]+}} ; MIPS64R6-DAG: sdc1 $[[T0]], 0($[[T1]]) -; CHECK-NACL-NOT: sdxc1 - %0 = load double, ptr @gd, align 8 %arrayidx = getelementptr inbounds double, ptr %b, i32 %o store double %0, ptr %arrayidx, align 8 diff --git a/llvm/test/CodeGen/Mips/indirect-jump-hazard/long-branch.ll b/llvm/test/CodeGen/Mips/indirect-jump-hazard/long-branch.ll index df15658b54f52..7d5d08cfa96e5 100644 --- a/llvm/test/CodeGen/Mips/indirect-jump-hazard/long-branch.ll +++ b/llvm/test/CodeGen/Mips/indirect-jump-hazard/long-branch.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; Except for the NACL version which isn't parsed by update_llc_test_checks.py ; RUN: llc -mtriple=mipsel-unknown-linux-gnu -force-mips-long-branch -O3 \ ; RUN: -mcpu=mips32r2 -mattr=+use-indirect-jump-hazard -relocation-model=pic \ diff --git a/llvm/test/CodeGen/Mips/ldexp.ll b/llvm/test/CodeGen/Mips/ldexp.ll index 8d266e2fb7017..5014c46b87939 100644 --- a/llvm/test/CodeGen/Mips/ldexp.ll +++ b/llvm/test/CodeGen/Mips/ldexp.ll @@ -125,6 +125,21 @@ define half @ldexp_f16(half %arg0, i32 %arg1) nounwind { ret half %ldexp } +define fp128 @ldexp_f128(fp128 %arg0, i32 %arg1) nounwind { +; SOFT-LABEL: ldexp_f128: +; SOFT: # %bb.0: +; SOFT-NEXT: addiu $sp, $sp, -32 +; SOFT-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; SOFT-NEXT: lw $1, 48($sp) +; SOFT-NEXT: jal ldexpl +; SOFT-NEXT: sw $1, 16($sp) +; SOFT-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; SOFT-NEXT: jr $ra +; SOFT-NEXT: addiu $sp, $sp, 32 + %ldexp = call fp128 @llvm.ldexp.f128.i32(fp128 %arg0, i32 %arg1) + ret fp128 %ldexp +} + declare double @llvm.ldexp.f64.i32(double, i32) #0 declare float @llvm.ldexp.f32.i32(float, i32) #0 declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>) 
#0 diff --git a/llvm/test/CodeGen/Mips/longbranch.ll b/llvm/test/CodeGen/Mips/longbranch.ll index 66ee3859ae448..3e9be39179db7 100644 --- a/llvm/test/CodeGen/Mips/longbranch.ll +++ b/llvm/test/CodeGen/Mips/longbranch.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; Except for the NACL version which isn't parsed by update_llc_test_checks.py ; RUN: llc -mtriple=mipsel-unknown-linux-gnu -O3 -relocation-model=pic < %s \ ; RUN: | FileCheck %s -check-prefix=NOLONGBRANCH @@ -27,11 +26,6 @@ ; RUN: llc -mtriple=mipsel-unknown-linux-gnu -mcpu=mips32r6 -mattr=micromips \ ; RUN: -force-mips-long-branch -O3 -relocation-model=pic < %s | FileCheck %s -check-prefix=MICROMIPSR6PIC - -; RUN: llc -mtriple=mipsel-none-nacl -force-mips-long-branch -O3 -relocation-model=pic < %s \ -; RUN: | FileCheck %s -check-prefix=NACL - - @x = external global i32 define void @test1(i32 signext %s) { @@ -276,38 +270,6 @@ define void @test1(i32 signext %s) { ; MICROMIPSR6PIC-NEXT: $BB0_4: # %end ; MICROMIPSR6PIC-NEXT: jrc $ra -; NACL-LABEL: test1: -; NACL: # %bb.0: -; NACL-NEXT: lui $2, %hi(_gp_disp) -; NACL-NEXT: addiu $2, $2, %lo(_gp_disp) -; NACL-NEXT: bnez $4, $BB0_3 -; NACL-NEXT: addu $2, $2, $25 -; NACL-NEXT: # %bb.1: -; NACL-NEXT: addiu $sp, $sp, -8 -; NACL-NEXT: sw $ra, 0($sp) -; NACL-NEXT: lui $1, %hi($BB0_4-$BB0_2) -; NACL-NEXT: bal $BB0_2 -; NACL-NEXT: addiu $1, $1, %lo($BB0_4-$BB0_2) -; NACL-NEXT: $BB0_2: -; NACL-NEXT: addu $1, $ra, $1 -; NACL-NEXT: lw $ra, 0($sp) -; NACL-NEXT: addiu $sp, $sp, 8 -; NACL-NEXT: jr $1 -; NACL-NEXT: nop -; NACL-NEXT: $BB0_3: -; NACL-NEXT: lw $1, %got(x)($2) -; NACL-NEXT: addiu $2, $zero, 1 -; NACL-NEXT: sw $2, 0($1) -; NACL-NEXT: .p2align 4 -; NACL-NEXT: $BB0_4: -; NACL-NEXT: jr $ra -; NACL-NEXT: nop - - -; Check the NaCl version. Check that sp change is not in the branch delay slot -; of "jr $1" instruction. Check that target of indirect branch "jr $1" is -; bundle aligned. - entry: %cmp = icmp eq i32 %s, 0 br i1 %cmp, label %end, label %then diff --git a/llvm/test/CodeGen/Mips/nacl-align.ll b/llvm/test/CodeGen/Mips/nacl-align.ll deleted file mode 100644 index 668b7a21e218a..0000000000000 --- a/llvm/test/CodeGen/Mips/nacl-align.ll +++ /dev/null @@ -1,99 +0,0 @@ -; RUN: llc -filetype=asm -mtriple=mipsel-none-nacl -relocation-model=static \ -; RUN: -O3 < %s | FileCheck %s - - -; This test tests that NaCl functions are bundle-aligned. - -define void @test0() { - ret void - -; CHECK: .p2align 4 -; CHECK-NOT: .p2align -; CHECK-LABEL: test0: - -} - - -; This test tests that blocks that are jumped to through jump table are -; bundle-aligned. 
- -define i32 @test1(i32 %i) { -entry: - switch i32 %i, label %default [ - i32 0, label %bb1 - i32 1, label %bb2 - i32 2, label %bb3 - i32 3, label %bb4 - ] - -bb1: - ret i32 111 -bb2: - ret i32 222 -bb3: - ret i32 333 -bb4: - ret i32 444 -default: - ret i32 555 - - -; CHECK-LABEL: test1: - -; CHECK: .p2align 4 -; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}: -; CHECK-NEXT: jr $ra -; CHECK-NEXT: addiu $2, $zero, 111 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}: -; CHECK-NEXT: jr $ra -; CHECK-NEXT: addiu $2, $zero, 333 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}: -; CHECK-NEXT: jr $ra -; CHECK-NEXT: addiu $2, $zero, 444 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}: -; CHECK-NEXT: jr $ra -; CHECK-NEXT: addiu $2, $zero, 222 -; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}: -; CHECK-NEXT: jr $ra -; CHECK-NEXT: addiu $2, $zero, 555 - -} - - -; This test tests that a block whose address is taken is bundle-aligned in NaCl. - -@bb_array = constant [2 x ptr] [ptr blockaddress(@test2, %bb1), - ptr blockaddress(@test2, %bb2)], align 4 - -define i32 @test2(i32 %i) { -entry: - %elementptr = getelementptr inbounds [2 x ptr], ptr @bb_array, i32 0, i32 %i - %0 = load ptr, ptr %elementptr, align 4 - indirectbr ptr %0, [label %bb1, label %bb2] - -bb1: - ret i32 111 -bb2: - ret i32 222 - - -; CHECK-LABEL: test2: - -; Note that there are two consecutive labels - one temporary and one for -; basic block. - -; CHECK: .p2align 4 -; CHECK-NEXT: ${{[a-zA-Z0-9]+}}: -; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}: -; CHECK-NEXT: jr $ra -; CHECK-NEXT: addiu $2, $zero, 111 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: ${{[a-zA-Z0-9]+}}: -; CHECK-NEXT: ${{BB[0-9]+_[0-9]+}}: -; CHECK-NEXT: jr $ra -; CHECK-NEXT: addiu $2, $zero, 222 - -} diff --git a/llvm/test/CodeGen/Mips/nacl-branch-delay.ll b/llvm/test/CodeGen/Mips/nacl-branch-delay.ll deleted file mode 100644 index 38348d8d49ec4..0000000000000 --- a/llvm/test/CodeGen/Mips/nacl-branch-delay.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -filetype=asm -mtriple=mipsel-none-linux -relocation-model=static \ -; RUN: -O3 < %s | FileCheck %s - -; RUN: llc -filetype=asm -mtriple=mipsel-none-nacl -relocation-model=static \ -; RUN: -O3 < %s | FileCheck %s -check-prefix=CHECK-NACL - -@x = global i32 0, align 4 -declare void @f1(i32) -declare void @f2() - - -define void @test1() { - %1 = load i32, ptr @x, align 4 - call void @f1(i32 %1) - ret void - - -; CHECK-LABEL: test1 - -; We first make sure that for non-NaCl targets branch-delay slot contains -; dangerous instructions. - -; Check that branch-delay slot is used to load argument from x before function -; call. - -; CHECK: jal -; CHECK-NEXT: lw $4, %lo(x)(${{[0-9]+}}) - -; Check that branch-delay slot is used for adjusting sp before return. - -; CHECK: jr $ra -; CHECK-NEXT: addiu $sp, $sp, {{[0-9]+}} - - -; For NaCl, check that branch-delay slot doesn't contain dangerous instructions. - -; CHECK-NACL: jal -; CHECK-NACL-NEXT: nop - -; CHECK-NACL: jr $ra -; CHECK-NACL-NEXT: nop -} - - -define void @test2() { - store i32 1, ptr @x, align 4 - call void @f2() - ret void - - -; CHECK-LABEL: test2 - -; Check that branch-delay slot is used for storing to x before function call. - -; CHECK: jal -; CHECK-NEXT: sw ${{[0-9]+}}, %lo(x)(${{[0-9]+}}) - -; Check that branch-delay slot is used for adjusting sp before return. - -; CHECK: jr $ra -; CHECK-NEXT: addiu $sp, $sp, {{[0-9]+}} - - -; For NaCl, check that branch-delay slot doesn't contain dangerous instructions. 
- -; CHECK-NACL: jal -; CHECK-NACL-NEXT: nop - -; CHECK-NACL: jr $ra -; CHECK-NACL-NEXT: nop -} diff --git a/llvm/test/CodeGen/Mips/nacl-reserved-regs.ll b/llvm/test/CodeGen/Mips/nacl-reserved-regs.ll deleted file mode 100644 index b03418dff6ccc..0000000000000 --- a/llvm/test/CodeGen/Mips/nacl-reserved-regs.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc -mtriple=mipsel -O3 < %s | FileCheck %s -; RUN: llc -mtriple=mipsel-none-nacl-gnu -O3 < %s \ -; RUN: | FileCheck %s -check-prefix=CHECK-NACL - -@var = external global i32 - -define void @f() { - %val1 = load volatile i32, ptr @var - %val2 = load volatile i32, ptr @var - %val3 = load volatile i32, ptr @var - %val4 = load volatile i32, ptr @var - %val5 = load volatile i32, ptr @var - %val6 = load volatile i32, ptr @var - %val7 = load volatile i32, ptr @var - %val8 = load volatile i32, ptr @var - %val9 = load volatile i32, ptr @var - %val10 = load volatile i32, ptr @var - %val11 = load volatile i32, ptr @var - %val12 = load volatile i32, ptr @var - %val13 = load volatile i32, ptr @var - %val14 = load volatile i32, ptr @var - %val15 = load volatile i32, ptr @var - %val16 = load volatile i32, ptr @var - store volatile i32 %val1, ptr @var - store volatile i32 %val2, ptr @var - store volatile i32 %val3, ptr @var - store volatile i32 %val4, ptr @var - store volatile i32 %val5, ptr @var - store volatile i32 %val6, ptr @var - store volatile i32 %val7, ptr @var - store volatile i32 %val8, ptr @var - store volatile i32 %val9, ptr @var - store volatile i32 %val10, ptr @var - store volatile i32 %val11, ptr @var - store volatile i32 %val12, ptr @var - store volatile i32 %val13, ptr @var - store volatile i32 %val14, ptr @var - store volatile i32 %val15, ptr @var - store volatile i32 %val16, ptr @var - ret void - -; Check that t6, t7 and t8 are used in non-NaCl code. -; CHECK: lw $14 -; CHECK: lw $15 -; CHECK: lw $24 - -; t6, t7 and t8 are reserved in NaCl. 
-; CHECK-NACL-NOT: lw $14 -; CHECK-NACL-NOT: lw $15 -; CHECK-NACL-NOT: lw $24 -} diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll index c7a0c60ae1f4d..94b3f0a2e1c3e 100644 --- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll @@ -93,7 +93,8 @@ entry: %3 = atomicrmw or ptr %0, i8 %1 monotonic, align 1 ; ALL: atom.xor.b32 %4 = atomicrmw xor ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw xchg ptr %0, i8 %1 monotonic, align 1 ret void } @@ -101,13 +102,17 @@ entry: ; CHECK-LABEL: minmax_i8 define void @minmax_i8(ptr %0, i8 %1) { entry: - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %2 = atomicrmw min ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %3 = atomicrmw max ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %4 = atomicrmw umin ptr %0, i8 %1 monotonic, align 1 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw umax ptr %0, i8 %1 monotonic, align 1 ret void } @@ -121,7 +126,8 @@ entry: %3 = atomicrmw or ptr %0, i16 %1 monotonic, align 2 ; ALL: atom.xor.b32 %4 = atomicrmw xor ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw xchg ptr %0, i16 %1 monotonic, align 2 ret void } @@ -129,13 +135,17 @@ entry: ; CHECK-LABEL: minmax_i16 define void @minmax_i16(ptr %0, i16 %1) { entry: - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %2 = atomicrmw min ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %3 = atomicrmw max ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %4 = atomicrmw umin ptr %0, i16 %1 monotonic, align 2 - ; ALL: atom.cas.b32 + ; SM30: atom.cas.b32 + ; SM60: atom.sys.cas.b32 %5 = atomicrmw umax ptr %0, i16 %1 monotonic, align 2 ret void } diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index 94f49b01e6ea6..f710d7f883a1b 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -70,7 +70,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX62-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r6, %r54; ; CHECKPTX62-NEXT: mov.b32 %r54, %r6; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; @@ -86,7 +86,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX62-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r9, %r55; ; CHECKPTX62-NEXT: mov.b32 %r55, %r9; ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; @@ -107,7 +107,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; -; 
CHECKPTX62-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r15, %r56; ; CHECKPTX62-NEXT: mov.b32 %r56, %r15; ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; @@ -128,7 +128,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX62-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r21, %r57; ; CHECKPTX62-NEXT: mov.b32 %r57, %r21; ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index b21bd16d55c2c..f96fd30019025 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; ; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r6, %r54; ; CHECKPTX71-NEXT: mov.b32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; @@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; ; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r9, %r55; ; CHECKPTX71-NEXT: mov.b32 %r55, %r9; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; @@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; ; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r15, %r56; ; CHECKPTX71-NEXT: mov.b32 %r56, %r15; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; @@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; ; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; ; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r21, %r57; ; CHECKPTX71-NEXT: mov.b32 %r57, %r21; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 9f900c961d2ed..63c389c36e87e 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: 
monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB0_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -56,10 +56,10 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, 
[%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: monotonic_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_generic( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -146,10 +149,10 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -160,13 +163,13 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +179,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -192,10 +195,10 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -212,7 +215,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB4_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +225,15 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // 
%partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_acquire_i8_shared( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acquire_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -238,10 +241,11 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -252,13 +256,13 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +272,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_generic( +define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -284,10 +288,10 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 
%rd2, [release_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -299,13 +303,13 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +319,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -331,10 +334,10 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -352,7 +355,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +365,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire 
ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: monotonic_seq_cst_i8_shared( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: release_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -378,10 +381,10 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -393,13 +396,13 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB8_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +412,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_generic( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -425,10 +428,11 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -439,13 +443,13 @@ define i8 
@acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +459,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_global( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -471,10 +475,11 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -491,7 +496,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +506,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_monotonic_i8_shared( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: 
acq_rel_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -517,10 +522,11 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -531,13 +537,13 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +553,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_generic( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_monotonic_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -563,10 +569,11 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -577,13 +584,13 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 
%r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -593,15 +600,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_global( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_acquire_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -609,10 +616,11 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -629,7 +637,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -639,15 +647,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_acquire_i8_shared( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; @@ -655,10 +663,11 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -669,13 +678,13 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: and.b32 %r15, %r14, 255; ; SM60-NEXT: shl.b32 %r3, %r15, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 ; SM60-NEXT: or.b32 %r17, %r20, %r3; ; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,4996 +694,1429 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: mov.b32 %r20, %r8; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_generic( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 
%r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB15_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_global( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB16_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; 
SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acquire_seq_cst_i8_shared( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: monotonic_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB17_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_generic( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM60-NEXT: 
ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB18_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB18_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_global( +define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, 
%r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB19_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB19_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_monotonic_i8_shared( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acquire_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB20_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB20_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; 
-; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_generic( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB21_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB21_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB21_1; ; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_global( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred 
%p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB22_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB22_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_acquire_i8_shared( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: release_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 
%r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB23_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB23_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_generic( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: 
or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB24_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB24_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB24_1; ; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_global( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB25_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB25_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = 
cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: release_seq_cst_i8_shared( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB26_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB26_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_generic( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_monotonic_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, 
[acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB27_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB27_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic + ret i16 %new } -define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_global( +define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_acquire_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 
%r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB28_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB28_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire + ret i16 %new } -define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_monotonic_i8_shared( +define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM60-LABEL: seq_cst_seq_cst_i16_global_cta( ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<20>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: mov.b32 %r12, 65535; ; SM60-NEXT: shl.b32 %r13, %r12, %r1; ; SM60-NEXT: not.b32 %r2, %r13; ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; +; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r16, %r19, %r3; +; SM60-NEXT: or.b32 %r17, %r19, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM60-NEXT: @%p1 bra $L__BB29_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB29_1 Depth=1 ; SM60-NEXT: and.b32 %r8, %r7, %r2; -; 
SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM60-NEXT: mov.b32 %r19, %r8; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; +; SM60-NEXT: membar.cta; ; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst + ret i16 %new } -define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_generic( +define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_monotonic_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB30_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB30_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB30_1; -; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic + ret i32 %new } -define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_global( +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_acquire_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, 
[acq_rel_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB31_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB31_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB31_1; -; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new } -define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_acquire_i8_shared( +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: monotonic_seq_cst_i32_global_cta( ; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB32_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in 
Loop: Header=BB32_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB32_1; -; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB33_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB33_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB33_1; -; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB34_3; -; 
SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB34_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB34_1; -; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: acq_rel_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB35_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB35_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB35_1; -; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], 
%r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB36_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB36_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB36_1; -; SM60-NEXT: $L__BB36_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB37_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB37_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB37_1; -; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_monotonic_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; 
SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB38_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB38_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB38_1; -; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB39_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB39_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB39_1; -; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB40_1: // 
%partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB40_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB40_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB40_1; -; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_acquire_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB41_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB41_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB41_1; -; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: 
ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB42_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB42_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB42_1; -; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB43_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB43_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB43_1; -; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM60-LABEL: seq_cst_seq_cst_i8_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: 
shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; -; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM60-NEXT: @%p1 bra $L__BB44_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB44_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; -; SM60-NEXT: @%p2 bra $L__BB44_1; -; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB45_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB45_1; -; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, 
%r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB46_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB46_1; -; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB47_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB47_1; -; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB48_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB48_1; -; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB49_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB49_1; -; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB50_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB50_1; -; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB51_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB51_1; -; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; 
SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB52_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB52_1; -; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: monotonic_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB53_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB53_1; -; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; 
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB54_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB54_1; -; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB55_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB55_1; -; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_monotonic_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; 
SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB56_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB56_1; -; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB57_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB57_1; -; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, 
%rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB58_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB58_1; -; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_acquire_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB59_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB59_1; -; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: 
cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB60_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB60_1; -; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB61_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB61_1; -; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: acquire_seq_cst_i16_shared( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 
%r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB62_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB62_1; -; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_generic( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; -; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop -; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM60-NEXT: @%p1 bra $L__BB63_3; -; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; -; SM60-NEXT: @%p2 bra $L__BB63_1; -; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; -; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM60-LABEL: release_monotonic_i16_global( -; SM60: { -; SM60-NEXT: .reg .pred %p<3>; -; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; -; SM60-NEXT: .reg .b64 %rd<3>; -; SM60-EMPTY: -; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; -; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; 
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB64_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB64_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB64_1;
-; SM60-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
-  ret i16 %new
-}
-
-define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_monotonic_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB65_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB65_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB65_1;
-; SM60-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
-  ret i16 %new
-}
-
-define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB66_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB66_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB66_1;
-; SM60-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
-  ret i16 %new
-}
-
-define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_global(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB67_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB67_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB67_1;
-; SM60-NEXT:  $L__BB67_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
-  ret i16 %new
-}
-
-define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB68_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB68_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB68_1;
-; SM60-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
-  ret i16 %new
-}
-
-define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB69_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB69_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB69_1;
-; SM60-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
-  ret i16 %new
-}
-
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_global(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB70_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB70_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB70_1;
-; SM60-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
-  ret i16 %new
-}
-
-define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB71_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB71_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB71_1;
-; SM60-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
-  ret i16 %new
-}
-
-define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB72_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB72_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB72_1;
-; SM60-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
-  ret i16 %new
-}
-
-define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_global(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB73_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB73_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB73_1;
-; SM60-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
-  ret i16 %new
-}
-
-define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB74_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB74_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB74_1;
-; SM60-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
-  ret i16 %new
-}
-
-define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB75_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB75_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB75_1;
-; SM60-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
-  ret i16 %new
-}
-
-define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_global(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB76_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB76_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB76_1;
-; SM60-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
-  ret i16 %new
-}
-
-define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB77_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB77_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB77_1;
-; SM60-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
-  ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB78_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB78_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB78_1;
-; SM60-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
-  ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_global(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB79_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB79_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB79_1;
-; SM60-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-  ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB80_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB80_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB80_1;
-; SM60-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-  ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB81_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB81_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB81_1;
-; SM60-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
-  ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_global(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB82_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB82_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB82_1;
-; SM60-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
-  ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB83_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB83_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB83_1;
-; SM60-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
-  ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB84_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB84_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB84_1;
-; SM60-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
-  ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_global(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB85_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB85_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB85_1;
-; SM60-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
-  ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB86_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB86_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB86_1;
-; SM60-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
-  ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB87_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB87_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB87_1;
-; SM60-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
-  ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_global(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB88_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB88_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB88_1;
-; SM60-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-  ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .pred %p<3>;
-; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<20>;
-; SM60-NEXT:    .reg .b64 %rd<3>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
-; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT:    and.b32 %r11, %r10, 3;
-; SM60-NEXT:    shl.b32 %r1, %r11, 3;
-; SM60-NEXT:    mov.b32 %r12, 65535;
-; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM60-NEXT:    not.b32 %r2, %r13;
-; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM60-NEXT:    and.b32 %r19, %r15, %r2;
-; SM60-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
-; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r16, %r19, %r3;
-; SM60-NEXT:    or.b32 %r17, %r19, %r4;
-; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM60-NEXT:    @%p1 bra $L__BB89_3;
-; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM60-NEXT:    // in Loop: Header=BB89_1 Depth=1
-; SM60-NEXT:    and.b32 %r8, %r7, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM60-NEXT:    mov.b32 %r19, %r8;
-; SM60-NEXT:    @%p2 bra $L__BB89_1;
-; SM60-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-  ret i16 %new
-}
-
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
-  ret i32 %new
-}
-
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
-  ret i32 %new
-}
-
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
-  ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
-  ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
-  ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
-  ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
-  ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
-  ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
-  ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
-  ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
-  ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
-  ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
-  ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
-  ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
-  ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
-  ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
-  ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
-  ret i32 %new
-}
-
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_generic(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
-; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
-  ret i32 %new
-}
-
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_global(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
-; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
-  ret i32 %new
-}
-
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_shared(
-; SM60:       {
-; SM60-NEXT:    .reg .b32 %r<4>;
-; SM60-NEXT:    .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
-; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM60-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
-  ret i32 %new
-}
-
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_generic(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
-}
-
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_global(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
-}
-
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_shared(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_generic(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_global(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_shared(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
-}
-
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_generic(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
-}
-
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_global(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
-}
-
-define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_shared(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
-}
-
-define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_generic(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
-}
-
-define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_global(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
-}
-
-define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_shared(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
-}
-
-define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_generic(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
-}
-
-define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_global(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
-}
-
-define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_shared(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
-}
-
-define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_generic(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
-}
-
-define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_global(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
-}
-
-define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_shared(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
-}
-
-define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_generic(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
-}
-
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_global(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
-}
-
-define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_shared(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
-}
-
-define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_generic(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
-}
-
-define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_global(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
-}
-
-define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_shared(
-; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
-}
-
-define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_generic(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
- ret i64 %new
-}
-
-define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_global(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
- ret i64 %new
-}
-
-define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_shared(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
- ret i64 %new
-}
-
-define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_generic(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
- ret i64 %new
-}
-
-define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_global(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
- ret i64 %new
-}
-
-define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_shared(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
- ret i64 %new
-}
-
-define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_generic(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
- ret i64 %new
-}
-
-define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_global(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
- ret i64 %new
-}
-
-define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_shared(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
- ret i64 %new
-}
-
-define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_generic(
-; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
-; SM60-EMPTY:
-; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
-; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
 }
 
-define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_global(
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
 }
 
-define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_shared(
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
 }
 
-define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_generic(
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
 }
 
-define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_global(
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
 }
 
-define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_shared(
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
 }
 
-define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_generic(
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
 }
 
-define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_global(
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
 }
 
-define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_shared(
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
 }
 
-define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_generic(
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
 }
 
-define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_global(
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
 }
 
-define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_shared(
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
 }
 
-define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_generic(
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global_cta(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
 }
 
-define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_global(
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
 ret i64 %new
 }
 
-define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_shared(
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
 ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_generic(
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
 ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_global(
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
 ret i64 %new
 }
 
-define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_shared(
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
 ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_generic(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
 ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_global(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
 ret i64 %new
 }
 
-define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_shared(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
 ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_generic(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
 ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_global(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
 ret i64 %new
 }
 
-define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_shared(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
 ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_generic(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
 ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_global(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
 ret i64 %new
 }
 
-define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_shared(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
 ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_generic(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global_cta(
 ; SM60: {
 ; SM60-NEXT: .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT: atom.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
 ret i64 %new
 }
 
-define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_global(
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 255;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: and.b32 %r15, %r14, 255;
+; SM60-NEXT: shl.b32 %r3, %r15, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.b32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB60_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8;
+; SM60-NEXT: mov.b32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB60_1;
+; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
 }
 
-define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_shared(
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global(
 ; SM60: {
-; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
[%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_generic( +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_sys( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM60-NEXT: atom.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_global( +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_global_gpu( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM60-NEXT: atom.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_acquire_i64_shared( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 
%rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: ld.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB64_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB64_1; +; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_generic( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM60-LABEL: acq_rel_acquire_i8_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .pred %p<3>; +; SM60-NEXT: .reg .b16 %rs<2>; +; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM60-NEXT: membar.cta; +; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: and.b64 %rd1, %rd2, -4; +; SM60-NEXT: cvt.u32.u64 %r10, %rd2; +; SM60-NEXT: and.b32 %r11, %r10, 3; +; SM60-NEXT: shl.b32 %r1, %r11, 3; +; SM60-NEXT: mov.b32 %r12, 255; +; SM60-NEXT: shl.b32 %r13, %r12, %r1; +; SM60-NEXT: not.b32 %r2, %r13; +; SM60-NEXT: cvt.u32.u16 %r14, %rs1; +; SM60-NEXT: and.b32 %r15, %r14, 255; +; SM60-NEXT: shl.b32 %r3, %r15, %r1; +; SM60-NEXT: shl.b32 %r4, %r9, %r1; +; SM60-NEXT: 
ld.shared.b32 %r16, [%rd1]; +; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM60-NEXT: // =>This Inner Loop Header: Depth=1 +; SM60-NEXT: or.b32 %r17, %r20, %r3; +; SM60-NEXT: or.b32 %r18, %r20, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: @%p1 bra $L__BB65_3; +; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM60-NEXT: and.b32 %r8, %r7, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: @%p2 bra $L__BB65_1; +; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM60-NEXT: membar.cta; +; SM60-NEXT: st.param.b32 [func_retval0], %r14; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_global( +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_generic_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM60-NEXT: atom.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM60-LABEL: seq_cst_seq_cst_i64_shared( +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM60-LABEL: acq_rel_acquire_i32_shared_cta( ; SM60: { -; SM60-NEXT: .reg .b64 %rd<5>; +; SM60-NEXT: .reg .b32 %r<4>; +; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM60-NEXT: st.param.b64 [func_retval0], %rd3; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM60-NEXT: atom.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel 
acquire + ret i32 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 28b258dc2a868..5cb344d5ded84 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %} -define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_generic( +define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new } -define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_global( +define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -56,10 +56,10 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: 
ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new } -define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_monotonic_i8_shared( +define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: monotonic_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic + %pairold = 
cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new } -define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_generic( +define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -146,10 +149,10 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -160,13 +163,13 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -176,15 +179,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new } -define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_global( +define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -192,10 +195,10 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -212,7 +215,7 @@ define i8 
@monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -222,15 +225,15 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new } -define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_acquire_i8_shared( +define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acquire_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -238,10 +241,11 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -252,13 +256,13 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -268,15 +272,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new } -define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_generic( +define i8 
@release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -284,10 +288,10 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -299,13 +303,13 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -315,15 +319,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new } -define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_global( +define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -331,10 +334,10 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -352,7 +355,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: 
atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -362,15 +365,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new } -define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: monotonic_seq_cst_i8_shared( +define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: release_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -378,10 +381,10 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -393,13 +396,13 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -409,15 +412,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new } -define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_generic( +define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -425,10 +428,11 @@ 
define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -439,13 +443,13 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -455,15 +459,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new } -define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_global( +define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -471,10 +475,11 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.acq_rel.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -491,7 +496,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; 
SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +506,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_monotonic_i8_shared( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -517,10 +522,11 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -531,13 +537,13 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +553,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_generic( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_monotonic_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -563,10 +569,11 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -577,13 +584,13 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -593,15 +600,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_global( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_acquire_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -609,10 +616,11 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -629,7 +637,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -639,15 +647,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; 
SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_acquire_i8_shared( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM70-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; @@ -655,10 +663,11 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -669,13 +678,13 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: and.b32 %r15, %r14, 255; ; SM70-NEXT: shl.b32 %r3, %r15, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,4996 +694,1429 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: mov.b32 %r20, %r8; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_generic( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_monotonic_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; 
+; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB15_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_global( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_acquire_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, 
[%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB16_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM70-NEXT: mov.b32 %r19, %r8; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; +; SM70-NEXT: fence.acq_rel.cta; ; SM70-NEXT: st.param.b32 [func_retval0], %r14; ; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM70-LABEL: acquire_seq_cst_i8_shared( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM70-LABEL: monotonic_seq_cst_i16_global_cta( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<20>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM70-NEXT: fence.sc.cta; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; +; SM70-NEXT: mov.b32 %r12, 65535; ; SM70-NEXT: shl.b32 %r13, %r12, %r1; ; SM70-NEXT: not.b32 %r2, %r13; ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; +; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; +; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r16, %r19, %r3; +; SM70-NEXT: or.b32 %r17, %r19, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB17_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; 
SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB17_1;
 ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
 }
-define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_generic(
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB18_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB18_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB18_1;
 ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
 }
-define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_global(
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1];
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB19_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB19_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB19_1;
 ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
 }
-define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_shared(
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB20_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB20_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB20_1;
 ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
 }
-define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_generic(
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB21_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB21_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB21_1;
 ; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
 }
-define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_global(
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB22_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB22_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB22_1;
 ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
 }
-define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_shared(
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB23_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB23_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB23_1;
 ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
 }
-define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_generic(
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB24_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB24_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB24_1;
 ; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
 }
-define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_global(
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB25_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB25_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB25_1;
 ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
 }
-define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_shared(
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB26_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB26_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB26_1;
 ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
 }
-define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_generic(
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB27_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB27_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB27_1;
 ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
 }
-define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_global(
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB28_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB28_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB28_1;
 ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
 }
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_shared(
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .pred %p<3>;
 ; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
 ; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1];
+; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
 ; SM70-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT: and.b32 %r11, %r10, 3;
 ; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: mov.b32 %r12, 65535;
 ; SM70-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM70-NEXT: not.b32 %r2, %r13;
 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
 ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop
 ; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM70-NEXT: @%p1 bra $L__BB29_3;
 ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT: // in Loop: Header=BB29_1 Depth=1
 ; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM70-NEXT: mov.b32 %r19, %r8;
 ; SM70-NEXT: @%p2 bra $L__BB29_1;
 ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
- ret i8 %new
-}
-
-define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_generic(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB30_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB30_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB30_1;
-; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
- ret i8 %new
-}
-
-define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_global(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB31_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB31_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB31_1;
-; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
- ret i8 %new
-}
-
-define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_shared(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB32_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB32_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB32_1;
-; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
 ; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
 }
-define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_generic(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB33_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB33_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB33_1;
-; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
- ret i8 %new
-}
-
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_global(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB34_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB34_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB34_1;
-; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
- ret i8 %new
-}
-
-define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_shared(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB35_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB35_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB35_1;
-; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
- ret i8 %new
-}
-
-define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_generic(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB36_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB36_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB36_1;
-; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
- ret i8 %new
-}
-
-define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_global(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB37_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB37_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB37_1;
-; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
- ret i8 %new
-}
-
-define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_shared(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB38_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB38_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB38_1;
-; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
- ret i8 %new
-}
-
-define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_generic(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB39_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB39_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB39_1;
-; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
- ret i8 %new
-}
-
-define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_global(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB40_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB40_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB40_1;
-; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
- ret i8 %new
-}
-
-define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_shared(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB41_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB41_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB41_1;
-; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
- ret i8 %new
-}
-
-define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_generic(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB42_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB42_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB42_1;
-; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
- ret i8 %new
-}
-
-define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_global(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB43_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB43_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB43_1;
-; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
- ret i8 %new
-}
-
-define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_shared(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 255;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: and.b32 %r15, %r14, 255;
-; SM70-NEXT: shl.b32 %r3, %r15, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB44_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB44_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM70-NEXT: mov.b32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB44_1;
-; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
- ret i8 %new
-}
-
-define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_generic(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB45_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT: mov.b32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB45_1;
-; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
-}
-
-define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_global(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB46_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT: mov.b32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB46_1;
-; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
-}
-
-define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_shared(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB47_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT: mov.b32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB47_1;
-; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
-}
-
-define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_generic(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB48_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT: mov.b32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB48_1;
-; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
-}
-
-define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_global(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.b32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB49_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT: mov.b32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB49_1;
-; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
-}
-
-define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_shared(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB50_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT: mov.b32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB50_1;
-; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
-}
-
-define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_seq_cst_i16_generic(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.b32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB51_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT: mov.b32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB51_1;
-; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
-; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
-}
-
-define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_seq_cst_i16_global(
-; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0];
-; SM70-NEXT:
fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB52_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB52_1; -; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: monotonic_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB53_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB53_1; -; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, 
[acquire_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB54_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB54_1; -; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB55_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB55_1; -; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB56_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB56_1; -; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB57_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB57_1; -; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: 
-; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB58_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB58_1; -; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB59_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB59_1; -; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; 
SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB60_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB60_1; -; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB61_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB61_1; -; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acquire_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg 
.b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB62_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB62_1; -; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB63_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB63_1; -; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; 
SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB64_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB64_1; -; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB65_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB65_1; -; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic - ret i16 %new -} - -define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_generic( -; 
SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB66_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB66_1; -; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB67_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB67_1; -; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_acquire_i16_shared(ptr addrspace(3) 
%addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB68_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB68_1; -; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire - ret i16 %new -} - -define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB69_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB69_1; -; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - 
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB70_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB70_1; -; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: release_seq_cst_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; -; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB71_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB71_1; -; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = 
cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB72_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB72_1; -; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB73_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB73_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB73_1; -; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end -; SM70-NEXT: 
fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_monotonic_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB74_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB74_1; -; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_generic( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB75_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; 
SM70-NEXT: @%p2 bra $L__BB75_1; -; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_global( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB76_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; -; SM70-NEXT: @%p2 bra $L__BB76_1; -; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; -; SM70-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire - ret i16 %new -} - -define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM70-LABEL: acq_rel_acquire_i16_shared( -; SM70: { -; SM70-NEXT: .reg .pred %p<3>; -; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; -; SM70-NEXT: .reg .b64 %rd<3>; -; SM70-EMPTY: -; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; -; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; -; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; -; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop -; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM70-NEXT: @%p1 bra $L__BB77_3; -; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, 
%r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB77_1;
-; SM70-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
-  ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB78_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB78_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB78_1;
-; SM70-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
-  ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_global(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB79_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB79_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB79_1;
-; SM70-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-  ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB80_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB80_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB80_1;
-; SM70-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
-  ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB81_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB81_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB81_1;
-; SM70-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
-  ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_global(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB82_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB82_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB82_1;
-; SM70-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
-  ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB83_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB83_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB83_1;
-; SM70-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
-  ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB84_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB84_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB84_1;
-; SM70-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
-  ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_global(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB85_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB85_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB85_1;
-; SM70-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
-  ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB86_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB86_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB86_1;
-; SM70-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
-  ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB87_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB87_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB87_1;
-; SM70-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
-  ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_global(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB88_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB88_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB88_1;
-; SM70-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-  ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<20>;
-; SM70-NEXT:    .reg .b64 %rd<3>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT:    and.b32 %r11, %r10, 3;
-; SM70-NEXT:    shl.b32 %r1, %r11, 3;
-; SM70-NEXT:    mov.b32 %r12, 65535;
-; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
-; SM70-NEXT:    not.b32 %r2, %r13;
-; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
-; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
-; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r16, %r19, %r3;
-; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT:    setp.eq.b32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB89_3;
-; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB89_1 Depth=1
-; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.b32 %p2, %r19, %r8;
-; SM70-NEXT:    mov.b32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB89_1;
-; SM70-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
-; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
-  ret i16 %new
-}
-
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
-  ret i32 %new
-}
-
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
-  ret i32 %new
-}
-
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
-  ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
-  ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
-  ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
-  ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
-  ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
-  ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
-  ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
-  ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
-  ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
-  ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
-  ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
-  ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
-  ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
-  ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
-  ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
-  ret i32 %new
-}
-
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
-  ret i32 %new
-}
-
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
-  ret i32 %new
-}
-
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
-  ret i32 %new
-}
-
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
-  ret i32 %new
-}
-
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
-  ret i32 %new
-}
-
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
-  ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
-  ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
-  ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
-  ret i32 %new
-}
-
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
-  ret i32 %new
-}
-
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
-  ret i32 %new
-}
-
-define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
-  ret i32 %new
-}
-
-define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
-  ret i32 %new
-}
-
-define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
-  ret i32 %new
-}
-
-define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
-  ret i32 %new
-}
-
-define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
-  ret i32 %new
-}
-
-define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
-  ret i32 %new
-}
-
-define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
-  ret i32 %new
-}
-
-define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
-  ret i32 %new
-}
-
-define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
-  ret i32 %new
-}
-
-define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
-  ret i32 %new
-}
-
-define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
-  ret i32 %new
-}
-
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
-  ret i32 %new
-}
-
-define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
-  ret i32 %new
-}
-
-define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
-  ret i32 %new
-}
-
-define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b32 %r<4>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
-  ret i32 %new
-}
-
-define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_shared(
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global_cta(
 ; SM70:       {
 ; SM70-NEXT:    .reg .b32 %r<4>;
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
-  ret i32 %new
-}
-
-define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
-; SM70-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
-  ret i64 %new
-}
-
-define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
-; SM70-NEXT:    atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
-  ret i64 %new
-}
-
-define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
-; SM70-NEXT:    atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
-  ret i64 %new
-}
-
-define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
-  ret i64 %new
-}
-
-define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
-  ret i64 %new
-}
-
-define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_shared(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
-  ret i64 %new
-}
-
-define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_generic(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
-; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
-  ret i64 %new
-}
-
-define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_global(
-; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
-; SM70-EMPTY:
-; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+  ret i32 %new
 }
 
-define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_shared(
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+  ret i32 %new
 }
 
-define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_generic(
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+  ret i32 %new
 }
 
-define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_global(
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+  ret i32 %new
 }
 
-define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_shared(
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+  ret i32 %new
 }
 
-define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_generic(
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+  ret i32 %new
 }
 
-define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_global(
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:    atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+  ret i32 %new
 }
 
-define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_shared(
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
-; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+  ret i32 %new
 }
 
-define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_generic(
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
-; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT:    atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
-  ret i64 %new
+  %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+  ret i32 %new
 }
 
-define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_global(
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global_cta(
 ; SM70:       {
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
-; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
-; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
+; SM70-NEXT:
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
 }

-define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_shared(
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
 }

-define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_generic(
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
-; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
 }

-define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_global(
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
-; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
 }

-define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_shared(
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
-; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
 }

-define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_generic(
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
 }

-define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_global(
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
 ret i64 %new
 }

-define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_shared(
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
 ret i64 %new
 }

-define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_generic(
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
 ret i64 %new
 }
-define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_global(
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
 ret i64 %new
 }

-define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_shared(
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
 ret i64 %new
 }

-define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_generic(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
 ret i64 %new
 }

-define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_global(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
 ret i64 %new
 }

-define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_shared(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
 ret i64 %new
 }

-define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_generic(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
 ret i64 %new
 }

-define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_global(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
 ret i64 %new
 }

-define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_shared(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
 ret i64 %new
 }

-define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_generic(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
 ret i64 %new
 }

-define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_global(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
 ret i64 %new
 }

-define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_shared(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
 ret i64 %new
 }

-define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_generic(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global_cta(
 ; SM70: {
 ; SM70-NEXT: .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
 ret i64 %new
 }

-define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_global(
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: and.b32 %r15, %r14, 255;
+; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.b32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB60_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
+; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB60_1;
+; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
 }

-define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_shared(
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
 }

-define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_generic(
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_sys(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
 }

-define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_global(
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_gpu(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
 }

-define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_shared(
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: and.b32 %r15, %r14, 255;
+; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.b32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB64_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
+; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB64_1;
+; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
- ret i64 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
 }

-define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_generic(
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 255;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: and.b32 %r15, %r14, 255;
+; SM70-NEXT: shl.b32 %r3, %r15, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.b32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB65_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8;
+; SM70-NEXT: mov.b32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB65_1;
+; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
 }

-define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_global(
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
- ret i64 %new
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
 }

-define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_shared(
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared_cta(
 ; SM70: {
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
- ret i64 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
 }
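For reference, a minimal sketch of the IR pattern these regenerated tests exercise (illustrative only, not part of the patch; the function name below is hypothetical). Per the SM70 checks above, syncscope("block") selects the .cta scope qualifier and addrspace(1) selects the .global qualifier on the emitted PTX atom.cas instruction:

; Hypothetical example: a block-scoped (CTA) CAS on global memory.
; With acq_rel/acquire orderings the tests above expect
; atom.acq_rel.cta.global.cas.b32 from llc -march=nvptx64.
define i32 @example_cas_block_scope(ptr addrspace(1) %ptr, i32 %cmp, i32 %new) {
  %pair = cmpxchg ptr addrspace(1) %ptr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
  %old = extractvalue { i32, i1 } %pair, 0
  ret i32 %old
}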
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
index 368fe3f036c9e..7cb259023d6dd 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}

-define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -11,10 +11,10 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1];
+; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
@@ -25,13 +25,13 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: and.b32 %r15, %r14, 255;
 ; SM90-NEXT: shl.b32 %r3, %r15, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r16, [%rd1];
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT: and.b32 %r20, %r16, %r2;
 ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB0_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
 ret i8 %new
 }

-define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_global(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -56,10 +56,10 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1];
+; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1];
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
@@ -76,7 +76,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB1_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -86,14 +86,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB1_1;
 ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
 ret i8 %new
 }

-define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_shared(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -101,10 +102,11 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1];
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
@@ -115,13 +117,13 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT: and.b32 %r15, %r14, 255;
 ; SM90-NEXT: shl.b32 %r3, %r15, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT: and.b32 %r20, %r16, %r2;
 ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB2_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -131,14 +133,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB2_1;
 ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
 ret i8 %new
 }

-define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_generic(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -146,10 +149,10 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1];
+; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
@@ -160,13 +163,13 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: and.b32 %r15, %r14, 255;
 ; SM90-NEXT: shl.b32 %r3, %r15, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r16, [%rd1];
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT: and.b32 %r20, %r16, %r2;
 ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB3_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -176,15 +179,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB3_1;
 ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
 ret i8 %new
 }

-define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_global(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -192,10 +195,10 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1];
+; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1];
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
@@ -212,7 +215,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB4_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -222,15 +225,15 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB4_1;
 ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
 ret i8 %new
 }

-define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_shared(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -238,10 +241,11 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1];
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
@@ -252,13 +256,13 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: and.b32 %r15, %r14, 255;
 ; SM90-NEXT: shl.b32 %r3, %r15, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT: and.b32 %r20, %r16, %r2;
 ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB5_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -268,15 +272,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB5_1;
 ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
 ret i8 %new
 }
-define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_generic(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -284,10 +288,10 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1];
+; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
@@ -299,13 +303,13 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: and.b32 %r15, %r14, 255;
 ; SM90-NEXT: shl.b32 %r3, %r15, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r16, [%rd1];
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT: and.b32 %r20, %r16, %r2;
 ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB6_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -315,15 +319,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB6_1;
 ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
 ret i8 %new
 }

-define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_global(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -331,10 +334,10 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1];
+; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
@@ -352,7 +355,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB7_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -362,15 +365,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB7_1;
 ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
 ret i8 %new
 }

-define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_shared(
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -378,10 +381,10 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1];
+; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
@@ -393,13 +396,13 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: and.b32 %r15, %r14, 255;
 ; SM90-NEXT: shl.b32 %r3, %r15, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r16, [%rd1];
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT: and.b32 %r20, %r16, %r2;
 ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB8_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -409,15 +412,15 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB8_1;
 ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
 ret i8 %new
 }

-define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_monotonic_i8_generic(
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -425,10 +428,11 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1];
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
@@ -439,13 +443,13 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: and.b32 %r15, %r14, 255;
 ; SM90-NEXT: shl.b32 %r3, %r15, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r16, [%rd1];
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT: and.b32 %r20, %r16, %r2;
 ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
 ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
 ; SM90-NEXT: @%p1 bra $L__BB9_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -455,15 +459,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT: mov.b32 %r20, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB9_1;
 ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
 ret i8 %new
 }

-define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_monotonic_i8_global(
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
@@ -471,10 +475,11 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1];
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
@@ -491,7 +496,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
 ; SM90-NEXT: or.b32 %r17, %r20, %r3;
 ; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18,
%r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -501,15 +506,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new } -define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_monotonic_i8_shared( +define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -517,10 +522,11 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -531,13 +537,13 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -547,15 +553,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new } -define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_generic( +define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_monotonic_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -563,10 +569,11 @@ define i8 
@acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -577,13 +584,13 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -593,15 +600,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new } -define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_global( +define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_acquire_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -609,10 +616,11 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -629,7 +637,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // 
%bb.2: // %partword.cmpxchg.failure @@ -639,15 +647,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new } -define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_acquire_i8_shared( +define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: seq_cst_seq_cst_i8_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; @@ -655,10 +663,11 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -669,13 +678,13 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: and.b32 %r15, %r14, 255; ; SM90-NEXT: shl.b32 %r3, %r15, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 ; SM90-NEXT: or.b32 %r17, %r20, %r3; ; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; ; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -685,4996 +694,1446 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: mov.b32 %r20, %r8; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new } -define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_generic( +define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 
%rd2, [acquire_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB15_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB15_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic + ret i16 %new } -define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_global( +define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: 
shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB16_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB16_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB16_1; ; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire + ret i16 %new } -define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acquire_seq_cst_i8_shared( +define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: monotonic_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB17_3; ; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB17_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst + ret i16 %new } -define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_generic( +define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB18_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB18_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic + ret i16 %new } -define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_global( 
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB19_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB19_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire + ret i16 %new } -define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_monotonic_i8_shared( +define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acquire_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 
%r9, [acquire_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB20_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB20_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst + ret i16 %new } -define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_generic( +define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB21_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB21_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic + ret i16 %new } -define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_global( +define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB22_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB22_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end -; 
SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire + ret i16 %new } -define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_acquire_i8_shared( +define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: release_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB23_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB23_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst + ret i16 %new } -define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_generic( +define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg 
.b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB24_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB24_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic + ret i16 %new } -define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_global( +define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_acquire_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; 
SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB25_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB25_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire + ret i16 %new } -define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: release_seq_cst_i8_shared( +define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: acq_rel_seq_cst_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; 
SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB26_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB26_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst - ret i8 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst + ret i16 %new } -define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_monotonic_i8_generic( +define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { +; SM90-LABEL: seq_cst_monotonic_i16_global_cta( ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<20>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: mov.b32 %r12, 65535; ; SM90-NEXT: shl.b32 %r13, %r12, %r1; ; SM90-NEXT: not.b32 %r2, %r13; ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; +; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r16, %r19, %r3; +; SM90-NEXT: or.b32 %r17, %r19, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM90-NEXT: @%p1 bra $L__BB27_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB27_1 Depth=1 ; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; +; SM90-NEXT: mov.b32 %r19, %r8; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: fence.acquire.cta; ; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel 
monotonic
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
 }
 
-define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_monotonic_i8_global(
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b32 %r<20>;
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1];
+; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 255;
+; SM90-NEXT: mov.b32 %r12, 65535;
 ; SM90-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM90-NEXT: not.b32 %r2, %r13;
 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: and.b32 %r15, %r14, 255;
-; SM90-NEXT: shl.b32 %r3, %r15, %r1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r16, [%rd1];
-; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
 ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r17, %r20, %r3;
-; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM90-NEXT: @%p1 bra $L__BB28_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT: // in Loop: Header=BB28_1 Depth=1
 ; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT: mov.b32 %r20, %r8;
+; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM90-NEXT: mov.b32 %r19, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB28_1;
 ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
 }
 
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_monotonic_i8_shared(
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_cta(
 ; SM90: {
 ; SM90-NEXT: .reg .pred %p<3>;
 ; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b32 %r<20>;
 ; SM90-NEXT: .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1];
+; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
 ; SM90-NEXT: and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT: and.b32 %r11, %r10, 3;
 ; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 255;
+; SM90-NEXT: mov.b32 %r12, 65535;
 ; SM90-NEXT: shl.b32 %r13, %r12, %r1;
 ; SM90-NEXT: not.b32 %r2, %r13;
 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: and.b32 %r15, %r14, 255;
-; SM90-NEXT: shl.b32 %r3, %r15, %r1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
 ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop
 ; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r17, %r20, %r3;
-; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
 ; SM90-NEXT: @%p1 bra $L__BB29_3;
 ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
 ; SM90-NEXT: // in Loop: Header=BB29_1 Depth=1
 ; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT: mov.b32 %r20, %r8;
+; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
+; SM90-NEXT: mov.b32 %r19, %r8;
 ; SM90-NEXT: @%p2 bra $L__BB29_1;
 ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
- ret i8 %new
-}
-
-define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<21>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 255;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: and.b32 %r15, %r14, 255;
-; SM90-NEXT: shl.b32 %r3, %r15, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r16, [%rd1];
-; SM90-NEXT: and.b32 %r20, %r16, %r2;
-; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r17, %r20, %r3;
-; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM90-NEXT: @%p1 bra $L__BB30_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB30_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT: mov.b32 %r20, %r8;
-; SM90-NEXT: @%p2 bra $L__BB30_1;
-; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
- ret i8 %new
-}
-
-define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<21>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 255;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: and.b32 %r15, %r14, 255;
-; SM90-NEXT: shl.b32 %r3, %r15, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r16, [%rd1];
-; SM90-NEXT: and.b32 %r20, %r16, %r2;
-; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r17, %r20, %r3;
-; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM90-NEXT: @%p1 bra $L__BB31_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB31_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT: mov.b32 %r20, %r8;
-; SM90-NEXT: @%p2 bra $L__BB31_1;
-; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
 ; SM90-NEXT: st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
 }
 
-define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_shared(
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_cta(
 ; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<21>;
-; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 255;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: and.b32 %r15, %r14, 255;
-; SM90-NEXT: shl.b32 %r3, %r15, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r16, [%rd1];
-; SM90-NEXT: and.b32 %r20, %r16, %r2;
-; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r17, %r20, %r3;
-; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM90-NEXT: @%p1 bra $L__BB32_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB32_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT: mov.b32 %r20, %r8;
-; SM90-NEXT: @%p2 bra $L__BB32_1;
-; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
- ret i8 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
 }
 
-define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i8_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<21>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 255;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: and.b32 %r15, %r14, 255;
-; SM90-NEXT: shl.b32 %r3, %r15, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r16, [%rd1];
-; SM90-NEXT: and.b32 %r20, %r16, %r2;
-; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r17, %r20, %r3;
-; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18;
-; SM90-NEXT: @%p1 bra $L__BB33_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB33_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8;
-; SM90-NEXT: mov.b32 %r20, %r8;
-; SM90-NEXT: @%p2 bra $L__BB33_1;
-; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
- ret i8 %new
-}
-
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i8_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<21>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2,
[acq_rel_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB34_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB34_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB34_1; -; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: acq_rel_seq_cst_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB35_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB35_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB35_1; -; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: 
-; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB36_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB36_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB36_1; -; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB37_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB37_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB37_1; -; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_monotonic_i8_shared( -; SM90: { -; 
SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB38_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB38_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB38_1; -; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB39_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB39_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB39_1; -; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 
@seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB40_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB40_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB40_1; -; SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_acquire_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB41_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB41_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB41_1; -; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], 
%r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB42_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB42_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB42_1; -; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB43_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB43_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB43_1; -; SM90-NEXT: $L__BB43_3: // 
%partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { -; SM90-LABEL: seq_cst_seq_cst_i8_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; -; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; -; SM90-NEXT: @%p1 bra $L__BB44_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB44_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; -; SM90-NEXT: @%p2 bra $L__BB44_1; -; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst - ret i8 %new -} - -define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB45_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: 
mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB45_1; -; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB46_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB46_1; -; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB47_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: 
mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB47_1; -; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic - ret i16 %new -} - -define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB48_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB48_1; -; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB49_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 
%r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB49_1; -; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB50_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB50_1; -; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB51_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; 
SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB51_1; -; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB52_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB52_1; -; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: monotonic_seq_cst_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB53_3; -; SM90-NEXT: // %bb.2: // 
%partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB53_1; -; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst - ret i16 %new -} - -define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB54_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB54_1; -; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB55_3; -; 
SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB55_1; -; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_monotonic_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB56_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB56_1; -; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic - ret i16 %new -} - -define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: 
@%p1 bra $L__BB57_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB57_1; -; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB58_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB58_1; -; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_acquire_i16_shared( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, 
%r17; -; SM90-NEXT: @%p1 bra $L__BB59_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB59_1; -; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_generic( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; -; SM90-NEXT: @%p1 bra $L__BB60_3; -; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure -; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; -; SM90-NEXT: @%p2 bra $L__BB60_1; -; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end -; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst - ret i16 %new -} - -define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) { -; SM90-LABEL: acquire_seq_cst_i16_global( -; SM90: { -; SM90-NEXT: .reg .pred %p<3>; -; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; -; SM90-NEXT: .reg .b64 %rd<3>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; -; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; -; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop -; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, 
[%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB61_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB61_1;
-; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
-}
-
-define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_seq_cst_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB62_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB62_1;
-; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
-}
-
-define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB63_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB63_1;
-; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
-}
-
-define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB64_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB64_1;
-; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
-}
-
-define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB65_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB65_1;
-; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
-}
-
-define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB66_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB66_1;
-; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
-}
-
-define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB67_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB67_1;
-; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
-}
-
-define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB68_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB68_1;
-; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
-}
-
-define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB69_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB69_1;
-; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
-}
-
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB70_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB70_1;
-; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
-}
-
-define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB71_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB71_1;
-; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
-}
-
-define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB72_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB72_1;
-; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
-}
-
-define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB73_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB73_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB73_1;
-; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
-}
-
-define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB74_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB74_1;
-; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
-}
-
-define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB75_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB75_1;
-; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
-}
-
-define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB76_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB76_1;
-; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
-}
-
-define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB77_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB77_1;
-; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB78_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB78_1;
-; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB79_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB79_1;
-; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
-}
-
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB80_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB80_1;
-; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB81_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB81_1;
-; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB82_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB82_1;
-; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
-}
-
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB83_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB83_1;
-; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB84_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB84_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB84_1;
-; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB85_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB85_1;
-; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
-}
-
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB86_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB86_1;
-; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_generic(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB87_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB87_1;
-; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_global(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB88_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB88_1;
-; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
-}
-
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_shared(
-; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.b32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB89_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8;
-; SM90-NEXT: mov.b32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB89_1;
-; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
-}
-
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
-}
-
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
-}
-
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
-}
-
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
-}
-
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
-}
-
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
-}
-
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
-}
-
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
-}
-
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
-}
-
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
-; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
-}
-
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
-; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
-}
-
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
-; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
-}
-
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2];
-; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
-}
-
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
-; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
-}
-
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_shared(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
-}
-
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_monotonic_i32_generic(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
-; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
-}
-
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_monotonic_i32_global(
-; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
-; SM90-EMPTY:
-; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.b64 %rd1,
[acq_rel_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; 
SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: acq_rel_seq_cst_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_monotonic_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; 
SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_acquire_i32_shared( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_generic( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_global( -; SM90: { -; SM90-NEXT: .reg .b32 %r<4>; -; SM90-NEXT: .reg .b64 %rd<2>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.b32 %r2, 
[seq_cst_seq_cst_i32_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { -; SM90-LABEL: seq_cst_seq_cst_i32_shared( +define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_acquire_i32_global_cta( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; -; SM90-NEXT: st.param.b32 [func_retval0], %r2; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst - ret i32 %new -} - -define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_monotonic_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic - ret i64 %new -} - -define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, 
[monotonic_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_acquire_i64_shared( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_generic( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; -; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new -} - -define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: monotonic_seq_cst_i64_global( -; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; -; SM90-EMPTY: -; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire + ret i32 %new } -define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: 
monotonic_seq_cst_i64_shared( +define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: monotonic_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst + ret i32 %new } -define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_generic( +define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic + ret i32 %new } -define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_global( +define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[acquire_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire + ret i32 %new } -define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_monotonic_i64_shared( +define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acquire_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst + ret i32 %new } -define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_generic( +define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic + ret i32 %new } -define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_global( +define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire + ret i32 %new } -define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_acquire_i64_shared( +define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: release_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst + ret i32 %new } -define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_generic( +define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic + ret i32 %new } -define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: 
acquire_seq_cst_i64_global( +define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acquire_seq_cst_i64_shared( +define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst + ret i32 %new } -define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_generic( +define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_monotonic_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[seq_cst_monotonic_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic + ret i32 %new } -define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_global( +define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_acquire_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire + ret i32 %new } -define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_monotonic_i64_shared( +define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: seq_cst_seq_cst_i32_global_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst + ret i32 %new } -define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_generic( +define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, 
[release_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic ret i64 %new } -define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_global( +define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire ret i64 %new } -define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_acquire_i64_shared( +define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: monotonic_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst ret i64 %new } -define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_generic( +define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[release_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic ret i64 %new } -define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_global( +define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire ret i64 %new } -define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: release_seq_cst_i64_shared( +define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acquire_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst ret i64 %new } -define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_generic( +define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_monotonic_i64_global_cta( ; SM90: { ; 
SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic ret i64 %new } -define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_global( +define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire ret i64 %new } -define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_monotonic_i64_shared( +define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: release_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst ret i64 %new } -define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_generic( +define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: 
acq_rel_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic ret i64 %new } -define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_global( +define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_cta_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire ret i64 %new } -define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_acquire_i64_shared( +define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: acq_rel_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst ret i64 %new } -define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_generic( +define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; 
SM90-LABEL: seq_cst_monotonic_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic ret i64 %new } -define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_global( +define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_acquire_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire ret i64 %new } -define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: acq_rel_seq_cst_i64_shared( +define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { +; SM90-LABEL: seq_cst_seq_cst_i64_global_cta( ; SM90: { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0]; +; SM90-NEXT: fence.sc.cta; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2]; +; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst + %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst ret i64 %new } -define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 
%new) { -; SM90-LABEL: seq_cst_monotonic_i64_generic( +define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_global( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM90-NEXT: fence.release.sys; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB60_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB60_1; +; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.sys; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire + ret i8 %new } -define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_global( +define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new 
seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire + ret i32 %new } -define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_monotonic_i64_shared( +define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_sys( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_sys_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_sys_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_sys_param_2]; +; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_generic( +define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_cluster( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_cluster_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_cluster_param_2]; +; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_global( +define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_global_gpu( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_gpu_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_gpu_param_2]; +; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_acquire_i64_shared( +define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB65_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB65_1; +; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire - ret i64 %new + %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_generic( +define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { +; SM90-LABEL: acq_rel_acquire_i8_shared_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .pred %p<3>; +; SM90-NEXT: .reg .b16 %rs<2>; +; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[seq_cst_seq_cst_i64_generic_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; -; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; +; SM90-NEXT: fence.release.cta; +; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: and.b64 %rd1, %rd2, -4; +; SM90-NEXT: cvt.u32.u64 %r10, %rd2; +; SM90-NEXT: and.b32 %r11, %r10, 3; +; SM90-NEXT: shl.b32 %r1, %r11, 3; +; SM90-NEXT: mov.b32 %r12, 255; +; SM90-NEXT: shl.b32 %r13, %r12, %r1; +; SM90-NEXT: not.b32 %r2, %r13; +; SM90-NEXT: cvt.u32.u16 %r14, %rs1; +; SM90-NEXT: and.b32 %r15, %r14, 255; +; SM90-NEXT: shl.b32 %r3, %r15, %r1; +; SM90-NEXT: shl.b32 %r4, %r9, %r1; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; +; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop +; SM90-NEXT: // =>This Inner Loop Header: Depth=1 +; SM90-NEXT: or.b32 %r17, %r20, %r3; +; SM90-NEXT: or.b32 %r18, %r20, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; +; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: @%p1 bra $L__BB66_3; +; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure +; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 +; SM90-NEXT: and.b32 %r8, %r7, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; +; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: @%p2 bra $L__BB66_1; +; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end +; SM90-NEXT: fence.acquire.cta; +; SM90-NEXT: st.param.b32 [func_retval0], %r14; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire + ret i8 %new } -define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_global( +define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_generic_cta( ; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; -; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } -define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) { -; SM90-LABEL: seq_cst_seq_cst_i64_shared( +define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) { +; SM90-LABEL: acq_rel_acquire_i32_shared_cta( 
; SM90: { -; SM90-NEXT: .reg .b64 %rd<5>; +; SM90-NEXT: .reg .b32 %r<4>; +; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; -; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; -; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; -; SM90-NEXT: st.param.b64 [func_retval0], %rd3; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_cta_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_cta_param_2]; +; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2; +; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; - %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst - ret i64 %new + %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire + ret i32 %new } diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 25b4c74086dc1..237e42394ba2f 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -79,7 +79,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -99,8 +99,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -111,9 +111,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -206,7 +206,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -227,8 +227,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; +; SM90-NEXT: 
ld.param.u64 %rd2, [acquire_sys_i8_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -239,9 +239,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -336,7 +336,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -356,8 +356,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -369,9 +369,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -466,7 +466,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -487,8 +487,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -500,9 +500,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: 
and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -598,7 +598,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r17, %r20, %r3; ; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -619,8 +619,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; +; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -632,9 +632,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; +; SM90-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; +; SM90-NEXT: ld.u32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -726,7 +726,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -746,10 +746,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -759,7 +759,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -850,7 +850,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure @@ -871,10 +871,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -884,7 +884,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -977,7 +977,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -997,10 +997,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1011,7 +1011,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1104,7 +1104,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1125,10 +1125,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 
%r9, [acq_rel_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1139,7 +1139,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1234,7 +1234,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 ; SM70-NEXT: or.b32 %r16, %r19, %r3; ; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; ; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure @@ -1255,10 +1255,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; +; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM90-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1269,7 +1269,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r15, [%rd1]; +; SM90-NEXT: ld.u32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1316,7 +1316,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; -; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i32( @@ -1325,9 +1325,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; ; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1358,7 +1358,7 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 
[func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i32( @@ -1367,9 +1367,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1400,7 +1400,7 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i32( @@ -1409,9 +1409,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1442,7 +1442,7 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; ; SM70-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; -; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i32( @@ -1451,9 +1451,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; -; SM90-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; +; SM90-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; ; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1486,7 +1486,7 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; ; SM70-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; -; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; +; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i32( @@ -1495,10 +1495,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, 
[seq_cst_sys_i32_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; -; SM90-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; +; SM90-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; +; SM90-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; ; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; @@ -1529,7 +1529,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; -; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i64( @@ -1537,9 +1537,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; ; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1568,7 +1568,7 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i64( @@ -1576,9 +1576,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1607,7 +1607,7 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; -; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i64( @@ -1615,9 +1615,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 
%rd1, [acq_rel_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1646,7 +1646,7 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; -; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i64( @@ -1654,9 +1654,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; -; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; ; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1687,7 +1687,7 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; ; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; -; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; +; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i64( @@ -1695,10 +1695,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; +; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; -; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; +; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; +; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; ; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py index ae7450015ecd2..75623a59ad481 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.py +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py @@ -5,6 +5,14 @@ from itertools import product cmpxchg_func = Template( + """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { + %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure + ret i$size %new +} +""" +) + +cmpxchg_func_no_scope = Template( """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) { %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure ret i$size %new @@ -18,6 +26,14 @@ """ ) + +def get_addrspace_cast(addrspace): + if addrspace == 0: + return "" + else: + return " addrspace({})".format(str(addrspace)) + + TESTS = [(60, 50), (70, 63), (90, 87)] LLVM_SCOPES 
= ["", "block", "cluster", "device"]
@@ -34,24 +50,84 @@ ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"}
+
 if __name__ == "__main__":
     for sm, ptx in TESTS:
         with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
             print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
-            for size, success, failure, addrspace in product(
-                SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES
+
+            # Our test space is: SIZES X SUCCESS_ORDERINGS X FAILURE_ORDERINGS X ADDRSPACES X LLVM_SCOPES
+            # This is very large, so we instead test 3 slices.
+
+            # First slice: Are all orderings correctly supported, with and without emulation loops?
+            # set addrspace to global, scope to cta, generate all possible orderings, for all operation sizes
+            addrspace, llvm_scope = 1, "block"
+            for size, success, failure in product(
+                SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS
             ):
-                if addrspace == 0:
-                    addrspace_cast = ""
-                else:
-                    addrspace_cast = " addrspace({})".format(str(addrspace))
                 print(
                     cmpxchg_func.substitute(
                         success=success,
                         failure=failure,
                         size=size,
                         addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
-                        addrspace_cast=addrspace_cast,
+                        addrspace_cast=get_addrspace_cast(addrspace),
+                        llvm_scope=llvm_scope,
+                        ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
                     ),
                     file=fp,
                 )
+
+            # Second slice: Are all scopes correctly supported, with and without emulation loops?
+            # fix addrspace, ordering, generate all possible scopes, for operation sizes i8, i32
+            addrspace, success, failure = 1, "acq_rel", "acquire"
+            for size in [8, 32]:
+                print(
+                    cmpxchg_func_no_scope.substitute(
+                        success=success,
+                        failure=failure,
+                        size=size,
+                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+                        addrspace_cast=get_addrspace_cast(addrspace),
+                    ),
+                    file=fp,
+                )
+
+            for llvm_scope in LLVM_SCOPES:
+                if sm < 90 and llvm_scope == "cluster":
+                    continue
+                if llvm_scope == "block":
+                    # skip (acq_rel, acquire, global, cta)
+                    continue
+                print(
+                    cmpxchg_func.substitute(
+                        success=success,
+                        failure=failure,
+                        size=size,
+                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+                        addrspace_cast=get_addrspace_cast(addrspace),
+                        llvm_scope=llvm_scope,
+                        ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
+                    ),
+                    file=fp,
+                )
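To make the slicing concrete, the sketch below models the coverage the three slices produce; it is illustrative only, not part of the patch. SIZES and the two ordering lists are assumptions inferred from the i8-i64 tests and from LLVM's cmpxchg rules (the generator's real definitions appear earlier in cmpxchg.py), while LLVM_SCOPES and the address-space numbering are taken from the script itself.

from itertools import product

# Assumed axis values; FAILURE_ORDERINGS omits release/acq_rel because LLVM
# does not allow them as cmpxchg failure orderings.
SIZES = [8, 16, 32, 64]
SUCCESS_ORDERINGS = ["monotonic", "acquire", "release", "acq_rel", "seq_cst"]
FAILURE_ORDERINGS = ["monotonic", "acquire", "seq_cst"]
LLVM_SCOPES = ["", "block", "cluster", "device"]
ADDRSPACES = [0, 1, 3]  # generic, global, shared

def covered_cases():
    # Slice 1: every (success, failure) ordering pair, every size, at the
    # fixed point (global address space, "block" scope).
    for size, succ, fail in product(SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS):
        yield (size, succ, fail, 1, "block")
    # Slice 2: every scope at (acq_rel, acquire, global), for one emulated and
    # one native size; "block" is skipped because slice 1 already covers it.
    # (The generator also emits a no-syncscope variant here and skips "cluster"
    # below sm_90; both are omitted from this sketch.)
    for size, scope in product([8, 32], LLVM_SCOPES):
        if scope != "block":
            yield (size, "acq_rel", "acquire", 1, scope)
    # Slice 3: every address space at (acq_rel, acquire, "block"); global is
    # skipped because slice 1 already covers it.
    for size, aspace in product([8, 32], ADDRSPACES):
        if aspace != 1:
            yield (size, "acq_rel", "acquire", aspace, "block")

print(len(list(covered_cases())))  # 70 cases, versus 720 for the full product

The "emulation loops" mentioned in the slice comments are the partword.cmpxchg.loop blocks visible in the i8/i16 checks: PTX has no byte- or halfword-wide CAS, so the narrow value is positioned inside its naturally aligned 32-bit word and a b32 CAS is retried until the neighbouring bytes read back unchanged. A rough Python model of that lowering follows; the toy memory and function names are illustrative, with word_cas standing in for PTX atom.cas.b32.

import struct

MEM = bytearray(8)  # toy 8-byte memory holding two aligned 32-bit words

def load32(addr):
    return struct.unpack_from("<I", MEM, addr)[0]

def word_cas(addr, expected, desired):
    # Non-atomic stand-in for a 32-bit compare-and-swap; returns the old word.
    old = load32(addr)
    if old == expected:
        struct.pack_into("<I", MEM, addr, desired)
    return old

def cmpxchg_u8(addr, cmp8, new8):
    aligned = addr & ~3            # containing word: and.b64 %rd1, %rd2, -4
    shift = (addr & 3) * 8         # bit offset of the byte within the word
    mask = 0xFF << shift           # selects the target byte
    other = load32(aligned) & ~mask
    while True:
        observed = word_cas(aligned, other | (cmp8 << shift), other | (new8 << shift))
        if observed == other | (cmp8 << shift):
            return True            # success: partword.cmpxchg.end
        if (observed & ~mask) == other:
            return False           # the target byte itself mismatched
        other = observed & ~mask   # a neighbouring byte changed; retry the loop

MEM[5] = 0x42
assert cmpxchg_u8(5, 0x42, 0x7F) and MEM[5] == 0x7F

+
+            # Third slice: Are all address spaces correctly supported?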
+ # fix ordering, scope, generate all possible address spaces, for operation sizes i8, i32 + success, failure, llvm_scope = "acq_rel", "acquire", "block" + for size, addrspace in product([8, 32], ADDRSPACES): + if addrspace == 1: + # skip (acq_rel, acquire, global, cta) + continue + print( + cmpxchg_func.substitute( + success=success, + failure=failure, + size=size, + addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], + addrspace_cast=get_addrspace_cast(addrspace), + llvm_scope=llvm_scope, + ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope], ), file=fp, ) diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index a1020e68e1bae..2841e6751d029 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -171,30 +171,30 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0; ; 
CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; ; CHECK-NEXT: cvt.u32.u64 %r33, %rd2; ; CHECK-NEXT: and.b32 %r34, %r33, 3; @@ -209,7 +209,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r39, %r48, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; ; CHECK-NEXT: setp.eq.b32 %p1, %r6, %r39; ; CHECK-NEXT: @%p1 bra $L__BB4_3; ; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 @@ -224,7 +224,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r41, %r49, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; ; CHECK-NEXT: setp.eq.b32 %p3, %r10, %r41; ; CHECK-NEXT: @%p3 bra $L__BB4_6; ; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22 @@ -241,7 +241,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r43, %r50, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; ; CHECK-NEXT: setp.eq.b32 %p5, %r14, %r43; ; CHECK-NEXT: @%p5 bra $L__BB4_9; ; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 @@ -257,7 +257,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r45, %r51, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; ; CHECK-NEXT: setp.eq.b32 %p7, %r18, %r45; ; CHECK-NEXT: @%p7 bra $L__BB4_12; ; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 @@ -274,7 +274,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: or.b32 %r47, %r52, %r3; -; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; ; CHECK-NEXT: setp.eq.b32 %p9, %r22, %r47; ; CHECK-NEXT: @%p9 bra $L__BB4_15; ; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 410c0019c7222..cbc9f700b1f01 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1,14 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; ## Support i16x2 instructions -; RUN: llc < %s 
-mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ -; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | FileCheck -allow-deprecated-dag-overlap %s -; RUN: %if ptxas %{ \ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \ -; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | %ptxas-verify -arch=sm_90 \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefixes=CHECK,O0 +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefixes=CHECK,O3 +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_90 \ +; RUN: %} +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_90 \ ; RUN: %} +target triple = "nvptx64-nvidia-cuda" target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" define <4 x i8> @test_ret_const() #0 { @@ -79,61 +84,111 @@ define i8 @test_extract_3(<4 x i8> %a) #0 { } define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 { -; CHECK-LABEL: test_extract_i( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; -; CHECK-NEXT: or.b32 %r3, %r2, 30576; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r4; -; CHECK-NEXT: ret; +; O0-LABEL: test_extract_i( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; O0-NEXT: cvt.u32.u64 %r2, %rd1; +; O0-NEXT: or.b32 %r3, %r2, 30576; +; O0-NEXT: prmt.b32 %r4, %r1, 0, %r3; +; O0-NEXT: st.param.b32 [func_retval0], %r4; +; O0-NEXT: ret; +; +; O3-LABEL: test_extract_i( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_extract_i_param_1]; +; O3-NEXT: or.b32 %r3, %r2, 30576; +; O3-NEXT: prmt.b32 %r4, %r1, 0, %r3; +; O3-NEXT: st.param.b32 [func_retval0], %r4; +; O3-NEXT: ret; %e = extractelement <4 x i8> %a, i64 %idx ret i8 %e } define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_add( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_add_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_add_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; -; CHECK-NEXT: add.s16 %rs9, 
%rs8, %rs7; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; -; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_add( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_add_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs1, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs2, %r4; +; O0-NEXT: add.s16 %rs3, %rs2, %rs1; +; O0-NEXT: cvt.u32.u16 %r5, %rs3; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs4, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs5, %r7; +; O0-NEXT: add.s16 %rs6, %rs5, %rs4; +; O0-NEXT: cvt.u32.u16 %r8, %rs6; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs7, %r10; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: add.s16 %rs9, %rs8, %rs7; +; O0-NEXT: cvt.u32.u16 %r12, %rs9; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs10, %r13; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: add.s16 %rs12, %rs11, %rs10; +; O0-NEXT: cvt.u32.u16 %r15, %rs12; +; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_add( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_add_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs1, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs2, %r4; +; O3-NEXT: add.s16 %rs3, %rs2, %rs1; +; O3-NEXT: cvt.u32.u16 %r5, %rs3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs4, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs5, %r7; +; O3-NEXT: add.s16 %rs6, %rs5, %rs4; +; O3-NEXT: cvt.u32.u16 %r8, %rs6; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs7, %r10; +; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs8, %r11; +; O3-NEXT: add.s16 %rs9, %rs8, %rs7; +; O3-NEXT: cvt.u32.u16 %r12, %rs9; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs10, %r13; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: add.s16 %rs12, %rs11, %rs10; +; O3-NEXT: cvt.u32.u16 %r15, %rs12; +; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %r = add <4 x i8> %a, %b ret <4 x i8> %r } @@ -205,341 +260,631 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { } define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_sub( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.b32 %r2, [test_sub_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_sub_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; -; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; -; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_sub( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_sub_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_sub_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs1, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs2, %r4; +; O0-NEXT: sub.s16 %rs3, %rs2, %rs1; +; O0-NEXT: cvt.u32.u16 %r5, %rs3; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs4, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs5, %r7; +; O0-NEXT: sub.s16 %rs6, %rs5, %rs4; +; O0-NEXT: cvt.u32.u16 %r8, %rs6; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs7, %r10; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: sub.s16 %rs9, %rs8, %rs7; +; O0-NEXT: cvt.u32.u16 %r12, %rs9; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs10, %r13; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: sub.s16 %rs12, %rs11, %rs10; +; O0-NEXT: cvt.u32.u16 %r15, %rs12; +; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_sub( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_sub_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_sub_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs1, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs2, %r4; +; O3-NEXT: sub.s16 %rs3, %rs2, %rs1; +; O3-NEXT: cvt.u32.u16 %r5, %rs3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs4, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs5, %r7; +; O3-NEXT: sub.s16 %rs6, %rs5, %rs4; +; O3-NEXT: cvt.u32.u16 %r8, %rs6; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs7, %r10; +; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O3-NEXT: 
cvt.u16.u32 %rs8, %r11; +; O3-NEXT: sub.s16 %rs9, %rs8, %rs7; +; O3-NEXT: cvt.u32.u16 %r12, %rs9; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs10, %r13; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: sub.s16 %rs12, %rs11, %rs10; +; O3-NEXT: cvt.u32.u16 %r15, %rs12; +; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %r = sub <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_smax( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_smax_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_smax_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_smax( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_smax_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_smax_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.gt.s32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.gt.s32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.gt.s32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.gt.s32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_smax( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_smax_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_smax_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.gt.s32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.gt.s32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.gt.s32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; 
+; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.gt.s32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_umax( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_umax_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_umax_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.gt.u32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.gt.u32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.gt.u32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.gt.u32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_umax( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_umax_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.gt.u32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.gt.u32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.gt.u32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.gt.u32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_umax( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_umax_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.gt.u32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.gt.u32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.gt.u32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 
%r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.gt.u32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_smin( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_smin_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_smin_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_smin( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_smin_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_smin_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.le.s32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.le.s32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.le.s32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.le.s32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_smin( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_smin_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_smin_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.le.s32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.le.s32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.le.s32 %p3, %r8, 
%r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.le.s32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_umin( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_umin_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_umin_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.le.u32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.le.u32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.le.u32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.le.u32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_umin( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_umin_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.le.u32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.le.u32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.le.u32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.le.u32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_umin( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_umin_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.le.u32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.le.u32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: 
setp.le.u32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.le.u32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { -; CHECK-LABEL: test_eq( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r3, [test_eq_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_eq_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_eq_param_0]; -; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.eq.b32 %p1, %r5, %r4; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.eq.b32 %p2, %r7, %r6; -; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.eq.b32 %p3, %r9, %r8; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.eq.b32 %p4, %r11, %r10; -; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; -; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U; -; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U; -; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r22; -; CHECK-NEXT: ret; +; O0-LABEL: test_eq( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<23>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r3, [test_eq_param_2]; +; O0-NEXT: ld.param.b32 %r2, [test_eq_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_eq_param_0]; +; O0-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; +; O0-NEXT: setp.eq.b32 %p1, %r5, %r4; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U; +; O0-NEXT: setp.eq.b32 %p2, %r7, %r6; +; O0-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U; +; O0-NEXT: setp.eq.b32 %p3, %r9, %r8; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U; +; O0-NEXT: setp.eq.b32 %p4, %r11, %r10; +; O0-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; +; O0-NEXT: selp.b32 %r13, %r11, %r12, %p4; +; O0-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U; +; O0-NEXT: selp.b32 %r15, %r9, %r14, %p3; +; O0-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U; +; O0-NEXT: selp.b32 %r18, %r7, %r17, %p2; +; O0-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U; +; O0-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; O0-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; +; O0-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r22; +; O0-NEXT: ret; +; +; O3-LABEL: test_eq( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg 
.b32 %r<23>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_eq_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_eq_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.eq.b32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.eq.b32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.eq.b32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.eq.b32 %p4, %r10, %r9; +; O3-NEXT: ld.param.b32 %r11, [test_eq_param_2]; +; O3-NEXT: prmt.b32 %r12, %r11, 0, 0x7773U; +; O3-NEXT: selp.b32 %r13, %r10, %r12, %p4; +; O3-NEXT: prmt.b32 %r14, %r11, 0, 0x7772U; +; O3-NEXT: selp.b32 %r15, %r8, %r14, %p3; +; O3-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r11, 0, 0x7771U; +; O3-NEXT: selp.b32 %r18, %r6, %r17, %p2; +; O3-NEXT: prmt.b32 %r19, %r11, 0, 0x7770U; +; O3-NEXT: selp.b32 %r20, %r4, %r19, %p1; +; O3-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; +; O3-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r22; +; O3-NEXT: ret; %cmp = icmp eq <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c ret <4 x i8> %r } define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { -; CHECK-LABEL: test_ne( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r3, [test_ne_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_ne_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_ne_param_0]; -; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.ne.b32 %p1, %r5, %r4; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.ne.b32 %p2, %r7, %r6; -; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.ne.b32 %p3, %r9, %r8; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.ne.b32 %p4, %r11, %r10; -; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; -; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U; -; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U; -; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r22; -; CHECK-NEXT: ret; +; O0-LABEL: test_ne( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<23>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r3, [test_ne_param_2]; +; O0-NEXT: ld.param.b32 %r2, [test_ne_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_ne_param_0]; +; O0-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; +; O0-NEXT: setp.ne.b32 %p1, %r5, %r4; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U; +; O0-NEXT: setp.ne.b32 %p2, %r7, %r6; +; O0-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U; +; O0-NEXT: setp.ne.b32 %p3, %r9, %r8; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 
0x7773U; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U; +; O0-NEXT: setp.ne.b32 %p4, %r11, %r10; +; O0-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; +; O0-NEXT: selp.b32 %r13, %r11, %r12, %p4; +; O0-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U; +; O0-NEXT: selp.b32 %r15, %r9, %r14, %p3; +; O0-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U; +; O0-NEXT: selp.b32 %r18, %r7, %r17, %p2; +; O0-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U; +; O0-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; O0-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; +; O0-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r22; +; O0-NEXT: ret; +; +; O3-LABEL: test_ne( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<23>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_ne_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_ne_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.ne.b32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.ne.b32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.ne.b32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.ne.b32 %p4, %r10, %r9; +; O3-NEXT: ld.param.b32 %r11, [test_ne_param_2]; +; O3-NEXT: prmt.b32 %r12, %r11, 0, 0x7773U; +; O3-NEXT: selp.b32 %r13, %r10, %r12, %p4; +; O3-NEXT: prmt.b32 %r14, %r11, 0, 0x7772U; +; O3-NEXT: selp.b32 %r15, %r8, %r14, %p3; +; O3-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r11, 0, 0x7771U; +; O3-NEXT: selp.b32 %r18, %r6, %r17, %p2; +; O3-NEXT: prmt.b32 %r19, %r11, 0, 0x7770U; +; O3-NEXT: selp.b32 %r20, %r4, %r19, %p1; +; O3-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; +; O3-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r22; +; O3-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c ret <4 x i8> %r } define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_mul( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_mul_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_mul_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; -; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; -; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, 
%r9, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_mul( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_mul_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_mul_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs1, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs2, %r4; +; O0-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; +; O0-NEXT: cvt.u32.u16 %r5, %rs3; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs4, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs5, %r7; +; O0-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; +; O0-NEXT: cvt.u32.u16 %r8, %rs6; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs7, %r10; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; +; O0-NEXT: cvt.u32.u16 %r12, %rs9; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs10, %r13; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; +; O0-NEXT: cvt.u32.u16 %r15, %rs12; +; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_mul( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_mul_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_mul_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs1, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs2, %r4; +; O3-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; +; O3-NEXT: cvt.u32.u16 %r5, %rs3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs4, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs5, %r7; +; O3-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; +; O3-NEXT: cvt.u32.u16 %r8, %rs6; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs7, %r10; +; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs8, %r11; +; O3-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; +; O3-NEXT: cvt.u32.u16 %r12, %rs9; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs10, %r13; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; +; O3-NEXT: cvt.u32.u16 %r15, %rs12; +; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %r = mul <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_or( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_or_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_or_param_0]; -; CHECK-NEXT: or.b32 %r3, %r1, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_or( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_or_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_or_param_0]; +; O0-NEXT: or.b32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; 
O3-LABEL: test_or( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_or_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_or_param_1]; +; O3-NEXT: or.b32 %r3, %r1, %r2; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = or <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_or_computed(i8 %a) { -; CHECK-LABEL: test_or_computed( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_or_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-NEXT: ret; +; O0-LABEL: test_or_computed( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<8>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_or_computed_param_0]; +; O0-NEXT: mov.b32 %r1, 0; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O0-NEXT: cvt.u32.u16 %r3, %rs1; +; O0-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; +; O0-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; +; O0-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; O0-NEXT: or.b32 %r7, %r6, %r5; +; O0-NEXT: st.param.b32 [func_retval0], %r7; +; O0-NEXT: ret; +; +; O3-LABEL: test_or_computed( +; O3: { +; O3-NEXT: .reg .b32 %r<6>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %r1, [test_or_computed_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x5410U; +; O3-NEXT: bfi.b32 %r4, 5, %r3, 8, 8; +; O3-NEXT: or.b32 %r5, %r4, %r3; +; O3-NEXT: st.param.b32 [func_retval0], %r5; +; O3-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 %r = or <4 x i8> %ins.1, %ins.0 @@ -575,37 +920,61 @@ define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 { } define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_xor( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_xor_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_xor_param_0]; -; CHECK-NEXT: xor.b32 %r3, %r1, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_xor( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_xor_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_xor_param_0]; +; O0-NEXT: xor.b32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_xor( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_xor_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_xor_param_1]; +; O3-NEXT: xor.b32 %r3, %r1, %r2; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = xor <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_xor_computed(i8 %a) { -; CHECK-LABEL: test_xor_computed( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_xor_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; -; 
CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-NEXT: ret; +; O0-LABEL: test_xor_computed( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<8>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_xor_computed_param_0]; +; O0-NEXT: mov.b32 %r1, 0; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O0-NEXT: cvt.u32.u16 %r3, %rs1; +; O0-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; +; O0-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; +; O0-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; O0-NEXT: xor.b32 %r7, %r6, %r5; +; O0-NEXT: st.param.b32 [func_retval0], %r7; +; O0-NEXT: ret; +; +; O3-LABEL: test_xor_computed( +; O3: { +; O3-NEXT: .reg .b32 %r<6>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %r1, [test_xor_computed_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x5410U; +; O3-NEXT: bfi.b32 %r4, 5, %r3, 8, 8; +; O3-NEXT: xor.b32 %r5, %r4, %r3; +; O3-NEXT: st.param.b32 [func_retval0], %r5; +; O3-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 %r = xor <4 x i8> %ins.1, %ins.0 @@ -641,37 +1010,61 @@ define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 { } define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_and( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_and_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_and_param_0]; -; CHECK-NEXT: and.b32 %r3, %r1, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_and( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_and_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_and_param_0]; +; O0-NEXT: and.b32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_and( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_and_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_and_param_1]; +; O3-NEXT: and.b32 %r3, %r1, %r2; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = and <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_and_computed(i8 %a) { -; CHECK-LABEL: test_and_computed( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_and_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-NEXT: ret; +; O0-LABEL: test_and_computed( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<8>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_and_computed_param_0]; +; O0-NEXT: mov.b32 %r1, 0; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O0-NEXT: cvt.u32.u16 %r3, %rs1; +; O0-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; +; O0-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; +; O0-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; O0-NEXT: and.b32 %r7, %r6, %r5; +; O0-NEXT: st.param.b32 [func_retval0], %r7; +; O0-NEXT: ret; +; +; O3-LABEL: test_and_computed( +; O3: { +; O3-NEXT: .reg .b32 %r<6>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; 
O3-NEXT: ld.param.b8 %r1, [test_and_computed_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x5410U; +; O3-NEXT: bfi.b32 %r4, 5, %r3, 8, 8; +; O3-NEXT: and.b32 %r5, %r4, %r3; +; O3-NEXT: st.param.b32 [func_retval0], %r5; +; O3-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 %r = and <4 x i8> %ins.1, %ins.0 @@ -707,76 +1100,132 @@ define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 { } define void @test_ldst_v2i8(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v2i8( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: st.b32 [%rd2], %r1; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v2i8( +; O0: { +; O0-NEXT: .reg .b32 %r<2>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v2i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v2i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: st.b32 [%rd2], %r1; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v2i8( +; O3: { +; O3-NEXT: .reg .b32 %r<2>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v2i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_ldst_v2i8_param_1]; +; O3-NEXT: st.b32 [%rd2], %r1; +; O3-NEXT: ret; %t1 = load <4 x i8>, ptr %a store <4 x i8> %t1, ptr %b, align 16 ret void } define void @test_ldst_v3i8(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v3i8( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: st.b16 [%rd2], %r1; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U; -; CHECK-NEXT: st.b8 [%rd2+2], %r2; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v3i8( +; O0: { +; O0-NEXT: .reg .b32 %r<3>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v3i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: st.b16 [%rd2], %r1; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U; +; O0-NEXT: st.b8 [%rd2+2], %r2; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v3i8( +; O3: { +; O3-NEXT: .reg .b32 %r<3>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_ldst_v3i8_param_1]; +; O3-NEXT: st.b16 [%rd2], %r1; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U; +; O3-NEXT: st.b8 [%rd2+2], %r2; +; O3-NEXT: ret; %t1 = load <3 x i8>, ptr %a store <3 x i8> %t1, ptr %b, align 16 ret void } define void @test_ldst_v4i8(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v4i8( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: st.b32 [%rd2], %r1; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v4i8( +; O0: { +; O0-NEXT: .reg .b32 %r<2>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; 
O0-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: st.b32 [%rd2], %r1; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v4i8( +; O3: { +; O3-NEXT: .reg .b32 %r<2>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_param_1]; +; O3-NEXT: st.b32 [%rd2], %r1; +; O3-NEXT: ret; %t1 = load <4 x i8>, ptr %a store <4 x i8> %t1, ptr %b, align 16 ret void } define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v4i8_unaligned( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0]; -; CHECK-NEXT: ld.b8 %r1, [%rd1]; -; CHECK-NEXT: ld.b8 %r2, [%rd1+1]; -; CHECK-NEXT: ld.b8 %r3, [%rd1+2]; -; CHECK-NEXT: ld.b8 %r4, [%rd1+3]; -; CHECK-NEXT: st.b8 [%rd2+3], %r4; -; CHECK-NEXT: st.b8 [%rd2+2], %r3; -; CHECK-NEXT: st.b8 [%rd2+1], %r2; -; CHECK-NEXT: st.b8 [%rd2], %r1; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v4i8_unaligned( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0]; +; O0-NEXT: ld.b8 %r1, [%rd1]; +; O0-NEXT: ld.b8 %r2, [%rd1+1]; +; O0-NEXT: ld.b8 %r3, [%rd1+2]; +; O0-NEXT: ld.b8 %r4, [%rd1+3]; +; O0-NEXT: st.b8 [%rd2+3], %r4; +; O0-NEXT: st.b8 [%rd2+2], %r3; +; O0-NEXT: st.b8 [%rd2+1], %r2; +; O0-NEXT: st.b8 [%rd2], %r1; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v4i8_unaligned( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0]; +; O3-NEXT: ld.b8 %r1, [%rd1+1]; +; O3-NEXT: ld.b8 %r2, [%rd1]; +; O3-NEXT: ld.b8 %r3, [%rd1+3]; +; O3-NEXT: ld.b8 %r4, [%rd1+2]; +; O3-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1]; +; O3-NEXT: st.b8 [%rd2+2], %r4; +; O3-NEXT: st.b8 [%rd2+3], %r3; +; O3-NEXT: st.b8 [%rd2], %r2; +; O3-NEXT: st.b8 [%rd2+1], %r1; +; O3-NEXT: ret; %t1 = load <4 x i8>, ptr %a, align 1 store <4 x i8> %t1, ptr %b, align 1 ret void @@ -784,17 +1233,29 @@ define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) { define void @test_ldst_v8i8(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v8i8( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8i8_param_0]; -; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; -; CHECK-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v8i8( +; O0: { +; O0-NEXT: .reg .b32 %r<3>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v8i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v8i8_param_0]; +; O0-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; +; O0-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v8i8( +; O3: { +; O3-NEXT: .reg .b32 %r<3>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v8i8_param_0]; +; O3-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; +; O3-NEXT: 
ld.param.b64 %rd2, [test_ldst_v8i8_param_1]; +; O3-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; +; O3-NEXT: ret; %t1 = load <8 x i8>, ptr %a store <8 x i8> %t1, ptr %b, align 16 ret void @@ -803,168 +1264,310 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) { declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0 define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_call( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; -; CHECK-NEXT: { // callseq 0, 0 -; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r1; -; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r2; -; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_call( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_call_param_0]; +; O0-NEXT: { // callseq 0, 0 +; O0-NEXT: .param .align 4 .b8 param0[4]; +; O0-NEXT: st.param.b32 [param0], %r1; +; O0-NEXT: .param .align 4 .b8 param1[4]; +; O0-NEXT: st.param.b32 [param1], %r2; +; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O0-NEXT: ld.param.b32 %r3, [retval0]; +; O0-NEXT: } // callseq 0 +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_call( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_call_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; O3-NEXT: { // callseq 0, 0 +; O3-NEXT: .param .align 4 .b8 param0[4]; +; O3-NEXT: st.param.b32 [param0], %r1; +; O3-NEXT: .param .align 4 .b8 param1[4]; +; O3-NEXT: st.param.b32 [param1], %r2; +; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O3-NEXT: ld.param.b32 %r3, [retval0]; +; O3-NEXT: } // callseq 0 +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = call <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) ret <4 x i8> %r } define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_call_flipped( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; -; CHECK-NEXT: { // callseq 1, 0 -; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r2; -; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r1; -; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_call_flipped( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; +; O0-NEXT: { // callseq 1, 0 +; O0-NEXT: .param .align 4 .b8 param0[4]; +; O0-NEXT: st.param.b32 [param0], %r2; +; O0-NEXT: .param .align 4 .b8 param1[4]; 
+; O0-NEXT: st.param.b32 [param1], %r1; +; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O0-NEXT: ld.param.b32 %r3, [retval0]; +; O0-NEXT: } // callseq 1 +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_call_flipped( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; +; O3-NEXT: { // callseq 1, 0 +; O3-NEXT: .param .align 4 .b8 param0[4]; +; O3-NEXT: st.param.b32 [param0], %r2; +; O3-NEXT: .param .align 4 .b8 param1[4]; +; O3-NEXT: st.param.b32 [param1], %r1; +; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O3-NEXT: ld.param.b32 %r3, [retval0]; +; O3-NEXT: } // callseq 1 +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a) ret <4 x i8> %r } define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_tailcall_flipped( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; -; CHECK-NEXT: { // callseq 2, 0 -; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r2; -; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r1; -; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: } // callseq 2 -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_tailcall_flipped( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; +; O0-NEXT: { // callseq 2, 0 +; O0-NEXT: .param .align 4 .b8 param0[4]; +; O0-NEXT: st.param.b32 [param0], %r2; +; O0-NEXT: .param .align 4 .b8 param1[4]; +; O0-NEXT: st.param.b32 [param1], %r1; +; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O0-NEXT: ld.param.b32 %r3, [retval0]; +; O0-NEXT: } // callseq 2 +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_tailcall_flipped( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; +; O3-NEXT: { // callseq 2, 0 +; O3-NEXT: .param .align 4 .b8 param0[4]; +; O3-NEXT: st.param.b32 [param0], %r2; +; O3-NEXT: .param .align 4 .b8 param1[4]; +; O3-NEXT: st.param.b32 [param1], %r1; +; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O3-NEXT: ld.param.b32 %r3, [retval0]; +; O3-NEXT: } // callseq 2 +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = tail call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a) ret <4 x i8> %r } define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 { -; CHECK-LABEL: test_select( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; -; CHECK-NEXT: 
and.b16 %rs2, %rs1, 1; -; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_0]; -; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_select( +; O0: { +; O0-NEXT: .reg .pred %p<2>; +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; O0-NEXT: and.b16 %rs2, %rs1, 1; +; O0-NEXT: setp.ne.b16 %p1, %rs2, 0; +; O0-NEXT: ld.param.b32 %r2, [test_select_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_select_param_0]; +; O0-NEXT: selp.b32 %r3, %r1, %r2, %p1; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_select( +; O3: { +; O3-NEXT: .reg .pred %p<2>; +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; O3-NEXT: and.b16 %rs2, %rs1, 1; +; O3-NEXT: setp.ne.b16 %p1, %rs2, 0; +; O3-NEXT: ld.param.b32 %r1, [test_select_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_select_param_1]; +; O3-NEXT: selp.b32 %r3, %r1, %r2, %p1; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = select i1 %c, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) #0 { -; CHECK-LABEL: test_select_cc( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<28>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; -; CHECK-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7770U; -; CHECK-NEXT: setp.ne.b32 %p1, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r8, %r3, 0, 0x7771U; -; CHECK-NEXT: setp.ne.b32 %p2, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U; -; CHECK-NEXT: setp.ne.b32 %p3, %r10, %r9; -; CHECK-NEXT: prmt.b32 %r11, %r4, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; -; CHECK-NEXT: setp.ne.b32 %p4, %r12, %r11; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U; -; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; CHECK-NEXT: prmt.b32 %r16, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; -; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 0x3340U; -; CHECK-NEXT: prmt.b32 %r20, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7771U; -; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; -; CHECK-NEXT: prmt.b32 %r23, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r24, %r1, 0, 0x7770U; -; CHECK-NEXT: selp.b32 %r25, %r24, %r23, %p1; -; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 0x3340U; -; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r27; -; CHECK-NEXT: ret; +; O0-LABEL: test_select_cc( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<28>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; O0-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; O0-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; O0-NEXT: prmt.b32 %r5, %r4, 0, 0x7770U; +; 
O0-NEXT: prmt.b32 %r6, %r3, 0, 0x7770U; +; O0-NEXT: setp.ne.b32 %p1, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r8, %r3, 0, 0x7771U; +; O0-NEXT: setp.ne.b32 %p2, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r4, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U; +; O0-NEXT: setp.ne.b32 %p3, %r10, %r9; +; O0-NEXT: prmt.b32 %r11, %r4, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; +; O0-NEXT: setp.ne.b32 %p4, %r12, %r11; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U; +; O0-NEXT: selp.b32 %r15, %r14, %r13, %p4; +; O0-NEXT: prmt.b32 %r16, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O0-NEXT: selp.b32 %r18, %r17, %r16, %p3; +; O0-NEXT: prmt.b32 %r19, %r18, %r15, 0x3340U; +; O0-NEXT: prmt.b32 %r20, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r21, %r1, 0, 0x7771U; +; O0-NEXT: selp.b32 %r22, %r21, %r20, %p2; +; O0-NEXT: prmt.b32 %r23, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r24, %r1, 0, 0x7770U; +; O0-NEXT: selp.b32 %r25, %r24, %r23, %p1; +; O0-NEXT: prmt.b32 %r26, %r25, %r22, 0x3340U; +; O0-NEXT: prmt.b32 %r27, %r26, %r19, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r27; +; O0-NEXT: ret; +; +; O3-LABEL: test_select_cc( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<28>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_select_cc_param_3]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: ld.param.b32 %r4, [test_select_cc_param_2]; +; O3-NEXT: prmt.b32 %r5, %r4, 0, 0x7770U; +; O3-NEXT: setp.ne.b32 %p1, %r5, %r3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U; +; O3-NEXT: setp.ne.b32 %p2, %r7, %r6; +; O3-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r9, %r4, 0, 0x7772U; +; O3-NEXT: setp.ne.b32 %p3, %r9, %r8; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r11, %r4, 0, 0x7773U; +; O3-NEXT: setp.ne.b32 %p4, %r11, %r10; +; O3-NEXT: ld.param.b32 %r12, [test_select_cc_param_1]; +; O3-NEXT: prmt.b32 %r13, %r12, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U; +; O3-NEXT: selp.b32 %r15, %r14, %r13, %p4; +; O3-NEXT: prmt.b32 %r16, %r12, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O3-NEXT: selp.b32 %r18, %r17, %r16, %p3; +; O3-NEXT: prmt.b32 %r19, %r18, %r15, 0x3340U; +; O3-NEXT: prmt.b32 %r20, %r12, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r21, %r1, 0, 0x7771U; +; O3-NEXT: selp.b32 %r22, %r21, %r20, %p2; +; O3-NEXT: prmt.b32 %r23, %r12, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r24, %r1, 0, 0x7770U; +; O3-NEXT: selp.b32 %r25, %r24, %r23, %p1; +; O3-NEXT: prmt.b32 %r26, %r25, %r22, 0x3340U; +; O3-NEXT: prmt.b32 %r27, %r26, %r19, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r27; +; O3-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b, -; CHECK-LABEL: test_select_cc_i32_i8( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; -; CHECK-NEXT: ld.param.b32 %r10, [test_select_cc_i32_i8_param_3]; -; CHECK-NEXT: ld.param.b32 %r9, [test_select_cc_i32_i8_param_2]; -; CHECK-NEXT: prmt.b32 %r11, %r10, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r12, %r9, 0, 0x7770U; -; CHECK-NEXT: setp.ne.b32 
%p1, %r12, %r11; -; CHECK-NEXT: prmt.b32 %r13, %r10, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r14, %r9, 0, 0x7771U; -; CHECK-NEXT: setp.ne.b32 %p2, %r14, %r13; -; CHECK-NEXT: prmt.b32 %r15, %r10, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r16, %r9, 0, 0x7772U; -; CHECK-NEXT: setp.ne.b32 %p3, %r16, %r15; -; CHECK-NEXT: prmt.b32 %r17, %r10, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r18, %r9, 0, 0x7773U; -; CHECK-NEXT: setp.ne.b32 %p4, %r18, %r17; -; CHECK-NEXT: selp.b32 %r19, %r4, %r8, %p4; -; CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3; -; CHECK-NEXT: selp.b32 %r21, %r2, %r6, %p2; -; CHECK-NEXT: selp.b32 %r22, %r1, %r5, %p1; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19}; -; CHECK-NEXT: ret; +; O0-LABEL: test_select_cc_i32_i8( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<23>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1]; +; O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; +; O0-NEXT: ld.param.b32 %r10, [test_select_cc_i32_i8_param_3]; +; O0-NEXT: ld.param.b32 %r9, [test_select_cc_i32_i8_param_2]; +; O0-NEXT: prmt.b32 %r11, %r10, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r12, %r9, 0, 0x7770U; +; O0-NEXT: setp.ne.b32 %p1, %r12, %r11; +; O0-NEXT: prmt.b32 %r13, %r10, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r14, %r9, 0, 0x7771U; +; O0-NEXT: setp.ne.b32 %p2, %r14, %r13; +; O0-NEXT: prmt.b32 %r15, %r10, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r16, %r9, 0, 0x7772U; +; O0-NEXT: setp.ne.b32 %p3, %r16, %r15; +; O0-NEXT: prmt.b32 %r17, %r10, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r18, %r9, 0, 0x7773U; +; O0-NEXT: setp.ne.b32 %p4, %r18, %r17; +; O0-NEXT: selp.b32 %r19, %r4, %r8, %p4; +; O0-NEXT: selp.b32 %r20, %r3, %r7, %p3; +; O0-NEXT: selp.b32 %r21, %r2, %r6, %p2; +; O0-NEXT: selp.b32 %r22, %r1, %r5, %p1; +; O0-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19}; +; O0-NEXT: ret; +; +; O3-LABEL: test_select_cc_i32_i8( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<23>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; +; O3-NEXT: ld.param.b32 %r5, [test_select_cc_i32_i8_param_3]; +; O3-NEXT: prmt.b32 %r6, %r5, 0, 0x7770U; +; O3-NEXT: ld.param.b32 %r7, [test_select_cc_i32_i8_param_2]; +; O3-NEXT: prmt.b32 %r8, %r7, 0, 0x7770U; +; O3-NEXT: setp.ne.b32 %p1, %r8, %r6; +; O3-NEXT: prmt.b32 %r9, %r5, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r10, %r7, 0, 0x7771U; +; O3-NEXT: setp.ne.b32 %p2, %r10, %r9; +; O3-NEXT: prmt.b32 %r11, %r5, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r12, %r7, 0, 0x7772U; +; O3-NEXT: setp.ne.b32 %p3, %r12, %r11; +; O3-NEXT: prmt.b32 %r13, %r5, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r14, %r7, 0, 0x7773U; +; O3-NEXT: setp.ne.b32 %p4, %r14, %r13; +; O3-NEXT: ld.param.v4.b32 {%r15, %r16, %r17, %r18}, [test_select_cc_i32_i8_param_1]; +; O3-NEXT: selp.b32 %r19, %r4, %r18, %p4; +; O3-NEXT: selp.b32 %r20, %r3, %r17, %p3; +; O3-NEXT: selp.b32 %r21, %r2, %r16, %p2; +; O3-NEXT: selp.b32 %r22, %r1, %r15, %p1; +; O3-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19}; +; O3-NEXT: ret; <4 x i8> %c, <4 x i8> %d) #0 { %cc = icmp ne <4 x i8> %c, %d %r = select <4 x i1> %cc, <4 x i32> %a, <4 x i32> %b @@ -972,37 +1575,69 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b, } define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, -; CHECK-LABEL: test_select_cc_i8_i32( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: 
-; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; -; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_i8_i32_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_i8_i32_param_0]; -; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r7; -; CHECK-NEXT: setp.ne.b32 %p2, %r4, %r8; -; CHECK-NEXT: setp.ne.b32 %p3, %r5, %r9; -; CHECK-NEXT: setp.ne.b32 %p4, %r6, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r12, %r1, 0, 0x7773U; -; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U; -; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x3340U; -; CHECK-NEXT: prmt.b32 %r18, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U; -; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; -; CHECK-NEXT: prmt.b32 %r21, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; -; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 0x3340U; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r25; -; CHECK-NEXT: ret; +; O0-LABEL: test_select_cc_i8_i32( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<26>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; +; O0-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; +; O0-NEXT: ld.param.b32 %r2, [test_select_cc_i8_i32_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_select_cc_i8_i32_param_0]; +; O0-NEXT: setp.ne.b32 %p1, %r3, %r7; +; O0-NEXT: setp.ne.b32 %p2, %r4, %r8; +; O0-NEXT: setp.ne.b32 %p3, %r5, %r9; +; O0-NEXT: setp.ne.b32 %p4, %r6, %r10; +; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r12, %r1, 0, 0x7773U; +; O0-NEXT: selp.b32 %r13, %r12, %r11, %p4; +; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U; +; O0-NEXT: selp.b32 %r16, %r15, %r14, %p3; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x3340U; +; O0-NEXT: prmt.b32 %r18, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U; +; O0-NEXT: selp.b32 %r20, %r19, %r18, %p2; +; O0-NEXT: prmt.b32 %r21, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O0-NEXT: selp.b32 %r23, %r22, %r21, %p1; +; O0-NEXT: prmt.b32 %r24, %r23, %r20, 0x3340U; +; O0-NEXT: prmt.b32 %r25, %r24, %r17, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r25; +; O0-NEXT: ret; +; +; O3-LABEL: test_select_cc_i8_i32( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<26>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_select_cc_i8_i32_param_0]; +; O3-NEXT: ld.param.v4.b32 {%r2, %r3, %r4, %r5}, [test_select_cc_i8_i32_param_2]; +; O3-NEXT: ld.param.v4.b32 {%r6, %r7, %r8, %r9}, [test_select_cc_i8_i32_param_3]; +; O3-NEXT: setp.ne.b32 %p1, %r2, %r6; +; O3-NEXT: setp.ne.b32 %p2, %r3, %r7; +; O3-NEXT: setp.ne.b32 %p3, %r4, %r8; +; O3-NEXT: setp.ne.b32 %p4, %r5, %r9; +; O3-NEXT: ld.param.b32 %r10, [test_select_cc_i8_i32_param_1]; +; O3-NEXT: prmt.b32 %r11, %r10, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r12, %r1, 0, 0x7773U; +; O3-NEXT: selp.b32 %r13, %r12, %r11, %p4; +; O3-NEXT: prmt.b32 %r14, %r10, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U; +; O3-NEXT: selp.b32 %r16, %r15, %r14, %p3; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x3340U; +; O3-NEXT: prmt.b32 %r18, %r10, 0, 0x7771U; +; O3-NEXT: 
prmt.b32 %r19, %r1, 0, 0x7771U; +; O3-NEXT: selp.b32 %r20, %r19, %r18, %p2; +; O3-NEXT: prmt.b32 %r21, %r10, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O3-NEXT: selp.b32 %r23, %r22, %r21, %p1; +; O3-NEXT: prmt.b32 %r24, %r23, %r20, 0x3340U; +; O3-NEXT: prmt.b32 %r25, %r24, %r17, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r25; +; O3-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { %cc = icmp ne <4 x i32> %c, %d %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b @@ -1027,23 +1662,41 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { } define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { -; CHECK-LABEL: test_trunc_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; -; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; -; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-NEXT: ret; +; O0-LABEL: test_trunc_2xi64( +; O0: { +; O0-NEXT: .reg .b32 %r<8>; +; O0-NEXT: .reg .b64 %rd<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; +; O0-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; +; O0-NEXT: cvt.u32.u64 %r1, %rd4; +; O0-NEXT: cvt.u32.u64 %r2, %rd3; +; O0-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; O0-NEXT: cvt.u32.u64 %r4, %rd2; +; O0-NEXT: cvt.u32.u64 %r5, %rd1; +; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; O0-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r7; +; O0-NEXT: ret; +; +; O3-LABEL: test_trunc_2xi64( +; O3: { +; O3-NEXT: .reg .b32 %r<8>; +; O3-NEXT: .reg .b64 %rd<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; +; O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; +; O3-NEXT: cvt.u32.u64 %r1, %rd4; +; O3-NEXT: cvt.u32.u64 %r2, %rd3; +; O3-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; O3-NEXT: cvt.u32.u64 %r4, %rd2; +; O3-NEXT: cvt.u32.u64 %r5, %rd1; +; O3-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; O3-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r7; +; O3-NEXT: ret; %r = trunc <4 x i64> %a to <4 x i8> ret <4 x i8> %r } @@ -1066,24 +1719,43 @@ define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 { } define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 { -; CHECK-LABEL: test_zext_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<6>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U; -; CHECK-NEXT: cvt.u64.u32 %rd1, %r2; -; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U; -; CHECK-NEXT: cvt.u64.u32 %rd2, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7771U; -; CHECK-NEXT: cvt.u64.u32 %rd3, %r4; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; -; CHECK-NEXT: cvt.u64.u32 %rd4, %r5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd2, %rd1}; -; CHECK-NEXT: ret; +; O0-LABEL: test_zext_2xi64( +; O0: { +; O0-NEXT: .reg .b32 %r<6>; +; O0-NEXT: .reg .b64 %rd<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, 
[test_zext_2xi64_param_0]; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U; +; O0-NEXT: cvt.u64.u32 %rd1, %r2; +; O0-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U; +; O0-NEXT: cvt.u64.u32 %rd2, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7771U; +; O0-NEXT: cvt.u64.u32 %rd3, %r4; +; O0-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; +; O0-NEXT: cvt.u64.u32 %rd4, %r5; +; O0-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; +; O0-NEXT: st.param.v2.b64 [func_retval0+16], {%rd2, %rd1}; +; O0-NEXT: ret; +; +; O3-LABEL: test_zext_2xi64( +; O3: { +; O3-NEXT: .reg .b32 %r<6>; +; O3-NEXT: .reg .b64 %rd<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x7771U; +; O3-NEXT: cvt.u64.u32 %rd1, %r2; +; O3-NEXT: prmt.b32 %r3, %r1, 0, 0x7770U; +; O3-NEXT: cvt.u64.u32 %rd2, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O3-NEXT: cvt.u64.u32 %rd3, %r4; +; O3-NEXT: prmt.b32 %r5, %r1, 0, 0x7772U; +; O3-NEXT: cvt.u64.u32 %rd4, %r5; +; O3-NEXT: st.param.v2.b64 [func_retval0+16], {%rd4, %rd3}; +; O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; O3-NEXT: ret; %r = zext <4 x i8> %a to <4 x i64> ret <4 x i64> %r } @@ -1142,20 +1814,31 @@ define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 { define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { -; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<6>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 %r1, 6; -; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 0x3340U; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; -; CHECK-NEXT: ret; +; O0-LABEL: test_bitcast_4xi8_to_2xhalf( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<6>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; +; O0-NEXT: mov.b32 %r1, 6; +; O0-NEXT: prmt.b32 %r2, %r1, 7, 0x3340U; +; O0-NEXT: cvt.u32.u16 %r3, %rs1; +; O0-NEXT: prmt.b32 %r4, %r3, 5, 0x3340U; +; O0-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r5; +; O0-NEXT: ret; +; +; O3-LABEL: test_bitcast_4xi8_to_2xhalf( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %r1, [test_bitcast_4xi8_to_2xhalf_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 5, 0x3340U; +; O3-NEXT: prmt.b32 %r3, %r2, 1798, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 %ins.2 = insertelement <4 x i8> %ins.1, i8 6, i32 2 @@ -1166,153 +1849,277 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 { -; CHECK-LABEL: test_shufflevector( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; -; CHECK-NEXT: // implicit-def: %r3 -; CHECK-NEXT: prmt.b32 %r2, %r1, %r3, 0x123U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; -; CHECK-NEXT: ret; +; O0-LABEL: test_shufflevector( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; O0-NEXT: // implicit-def: %r3 +; O0-NEXT: prmt.b32 %r2, %r1, %r3, 0x123U; +; O0-NEXT: st.param.b32 [func_retval0], %r2; 
+; O0-NEXT: ret; +; +; O3-LABEL: test_shufflevector( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, %r3, 0x123U; +; O3-NEXT: st.param.b32 [func_retval0], %r2; +; O3-NEXT: ret; %s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> ret <4 x i8> %s } define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_shufflevector_2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_shufflevector_2_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_2_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r1, %r2, 0x2537U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_shufflevector_2( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_shufflevector_2_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_shufflevector_2_param_0]; +; O0-NEXT: prmt.b32 %r3, %r1, %r2, 0x2537U; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_shufflevector_2( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_shufflevector_2_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_shufflevector_2_param_1]; +; O3-NEXT: prmt.b32 %r3, %r1, %r2, 0x2537U; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %s = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> ret <4 x i8> %s } define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 { -; CHECK-LABEL: test_insertelement( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; -; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_insertelement( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_insertelement_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; O0-NEXT: cvt.u32.u16 %r2, %rs1; +; O0-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_insertelement( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; O3-NEXT: ld.param.b8 %r2, [test_insertelement_param_1]; +; O3-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %i = insertelement <4 x i8> %a, i8 %x, i64 1 ret <4 x i8> %i } define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { -; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; -; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; CHECK-NEXT: cvt.u32.u16 %r4, %rs8; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; -; 
CHECK-NEXT: mov.b32 %r7, {%rs10, %rs9}; -; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs11; -; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r11; -; CHECK-NEXT: ret; +; O0-LABEL: test_fptosi_4xhalf_to_4xi8( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<12>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; O0-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; +; O0-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; +; O0-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; O0-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; O0-NEXT: cvt.u32.u16 %r4, %rs8; +; O0-NEXT: cvt.u32.u16 %r5, %rs7; +; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; O0-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; +; O0-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; +; O0-NEXT: mov.b32 %r7, {%rs10, %rs9}; +; O0-NEXT: mov.b32 {%rs11, %rs12}, %r7; +; O0-NEXT: cvt.u32.u16 %r8, %rs12; +; O0-NEXT: cvt.u32.u16 %r9, %rs11; +; O0-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; +; O0-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r11; +; O0-NEXT: ret; +; +; O3-LABEL: test_fptosi_4xhalf_to_4xi8( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<10>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; O3-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; +; O3-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; +; O3-NEXT: mov.b32 %r1, {%rs6, %rs5}; +; O3-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; O3-NEXT: cvt.u32.u16 %r2, %rs8; +; O3-NEXT: cvt.u32.u16 %r3, %rs7; +; O3-NEXT: prmt.b32 %r4, %r3, %r2, 0x3340U; +; O3-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; +; O3-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; +; O3-NEXT: mov.b32 %r5, {%rs10, %rs9}; +; O3-NEXT: mov.b32 {%rs11, %rs12}, %r5; +; O3-NEXT: cvt.u32.u16 %r6, %rs12; +; O3-NEXT: cvt.u32.u16 %r7, %rs11; +; O3-NEXT: prmt.b32 %r8, %r7, %r6, 0x3340U; +; O3-NEXT: prmt.b32 %r9, %r8, %r4, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r9; +; O3-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> ret <4 x i8> %r } define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { -; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; -; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; CHECK-NEXT: cvt.u32.u16 %r4, %rs8; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; -; CHECK-NEXT: mov.b32 %r7, {%rs10, %rs9}; -; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs11; -; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r11; -; CHECK-NEXT: ret; +; O0-LABEL: test_fptoui_4xhalf_to_4xi8( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<12>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; O0-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; +; O0-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; +; 
O0-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; O0-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; O0-NEXT: cvt.u32.u16 %r4, %rs8; +; O0-NEXT: cvt.u32.u16 %r5, %rs7; +; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; O0-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; +; O0-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; +; O0-NEXT: mov.b32 %r7, {%rs10, %rs9}; +; O0-NEXT: mov.b32 {%rs11, %rs12}, %r7; +; O0-NEXT: cvt.u32.u16 %r8, %rs12; +; O0-NEXT: cvt.u32.u16 %r9, %rs11; +; O0-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; +; O0-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r11; +; O0-NEXT: ret; +; +; O3-LABEL: test_fptoui_4xhalf_to_4xi8( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<10>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; O3-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; +; O3-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; +; O3-NEXT: mov.b32 %r1, {%rs6, %rs5}; +; O3-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; O3-NEXT: cvt.u32.u16 %r2, %rs8; +; O3-NEXT: cvt.u32.u16 %r3, %rs7; +; O3-NEXT: prmt.b32 %r4, %r3, %r2, 0x3340U; +; O3-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; +; O3-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; +; O3-NEXT: mov.b32 %r5, {%rs10, %rs9}; +; O3-NEXT: mov.b32 {%rs11, %rs12}, %r5; +; O3-NEXT: cvt.u32.u16 %r6, %rs12; +; O3-NEXT: cvt.u32.u16 %r7, %rs11; +; O3-NEXT: prmt.b32 %r8, %r7, %r6, 0x3340U; +; O3-NEXT: prmt.b32 %r9, %r8, %r4, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r9; +; O3-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> ret <4 x i8> %r } define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: test_srem_v4i8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v4i8_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v4i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v4i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: ld.b32 %r2, [%rd2]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0xbbb3U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0xaaa2U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0xaaa2U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; -; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; -; CHECK-NEXT: rem.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; CHECK-NEXT: st.b32 [%rd3], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_srem_v4i8( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: // %entry +; O0-NEXT: ld.param.b64 %rd3, [test_srem_v4i8_param_2]; +; O0-NEXT: ld.param.b64 %rd2, [test_srem_v4i8_param_1]; +; O0-NEXT: 
ld.param.b64 %rd1, [test_srem_v4i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: ld.b32 %r2, [%rd2]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0xbbb3U; +; O0-NEXT: cvt.u16.u32 %rs1, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U; +; O0-NEXT: cvt.u16.u32 %rs2, %r4; +; O0-NEXT: rem.s16 %rs3, %rs2, %rs1; +; O0-NEXT: cvt.u32.u16 %r5, %rs3; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0xaaa2U; +; O0-NEXT: cvt.u16.u32 %rs4, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0xaaa2U; +; O0-NEXT: cvt.u16.u32 %rs5, %r7; +; O0-NEXT: rem.s16 %rs6, %rs5, %rs4; +; O0-NEXT: cvt.u32.u16 %r8, %rs6; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs7, %r10; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: rem.s16 %rs9, %rs8, %rs7; +; O0-NEXT: cvt.u32.u16 %r12, %rs9; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U; +; O0-NEXT: cvt.u16.u32 %rs10, %r13; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U; +; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: rem.s16 %rs12, %rs11, %rs10; +; O0-NEXT: cvt.u32.u16 %r15, %rs12; +; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O0-NEXT: st.b32 [%rd3], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_srem_v4i8( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b64 %rd<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: // %entry +; O3-NEXT: ld.param.b64 %rd1, [test_srem_v4i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_srem_v4i8_param_1]; +; O3-NEXT: ld.b32 %r2, [%rd2]; +; O3-NEXT: ld.param.b64 %rd3, [test_srem_v4i8_param_2]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0xbbb3U; +; O3-NEXT: cvt.u16.u32 %rs1, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U; +; O3-NEXT: cvt.u16.u32 %rs2, %r4; +; O3-NEXT: rem.s16 %rs3, %rs2, %rs1; +; O3-NEXT: cvt.u32.u16 %r5, %rs3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0xaaa2U; +; O3-NEXT: cvt.u16.u32 %rs4, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0xaaa2U; +; O3-NEXT: cvt.u16.u32 %rs5, %r7; +; O3-NEXT: rem.s16 %rs6, %rs5, %rs4; +; O3-NEXT: cvt.u32.u16 %r8, %rs6; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs7, %r10; +; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs8, %r11; +; O3-NEXT: rem.s16 %rs9, %rs8, %rs7; +; O3-NEXT: cvt.u32.u16 %r12, %rs9; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U; +; O3-NEXT: cvt.u16.u32 %rs10, %r13; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U; +; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: rem.s16 %rs12, %rs11, %rs10; +; O3-NEXT: cvt.u32.u16 %r15, %rs12; +; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O3-NEXT: st.b32 [%rd3], %r17; +; O3-NEXT: ret; entry: %t57 = load <4 x i8>, ptr %a, align 4 %t59 = load <4 x i8>, ptr %b, align 4 @@ -1328,52 +2135,97 @@ entry: ;; Ideally we want to split it into element-wise ops, but legalizer can't handle ;; odd-sized vectors. TL;DR; don't use odd-sized vectors of v8. 
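;; As a rough illustration of the lowering the checks below expect: lanes 0 and 1
;; are packed into one 16-bit word (ld.b8 pairs joined with shl.b16/or.b16), lane 2
;; stays scalar (ld.s8), and every lane still ends up in its own rem.s16.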
define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: test_srem_v3i8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v3i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v3i8_param_0]; -; CHECK-NEXT: ld.b8 %rs1, [%rd1]; -; CHECK-NEXT: ld.b8 %rs2, [%rd1+1]; -; CHECK-NEXT: shl.b16 %rs3, %rs2, 8; -; CHECK-NEXT: or.b16 %rs4, %rs3, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: ld.s8 %rs5, [%rd1+2]; -; CHECK-NEXT: ld.b8 %rs6, [%rd2]; -; CHECK-NEXT: ld.b8 %rs7, [%rd2+1]; -; CHECK-NEXT: shl.b16 %rs8, %rs7, 8; -; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r2, %rs9; -; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs12, %r4; -; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs13; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs14, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs15, %r7; -; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs16; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: // implicit-def: %r11 -; CHECK-NEXT: // implicit-def: %r12 -; CHECK-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; -; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: mov.b32 {%rs18, _}, %r13; -; CHECK-NEXT: st.b8 [%rd3], %rs18; -; CHECK-NEXT: shr.u16 %rs19, %rs18, 8; -; CHECK-NEXT: st.b8 [%rd3+1], %rs19; -; CHECK-NEXT: st.b8 [%rd3+2], %rs17; -; CHECK-NEXT: ret; +; O0-LABEL: test_srem_v3i8( +; O0: { +; O0-NEXT: .reg .b16 %rs<20>; +; O0-NEXT: .reg .b32 %r<14>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: // %entry +; O0-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2]; +; O0-NEXT: ld.param.b64 %rd2, [test_srem_v3i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_srem_v3i8_param_0]; +; O0-NEXT: ld.b8 %rs1, [%rd1]; +; O0-NEXT: ld.b8 %rs2, [%rd1+1]; +; O0-NEXT: shl.b16 %rs3, %rs2, 8; +; O0-NEXT: or.b16 %rs4, %rs3, %rs1; +; O0-NEXT: cvt.u32.u16 %r1, %rs4; +; O0-NEXT: ld.s8 %rs5, [%rd1+2]; +; O0-NEXT: ld.b8 %rs6, [%rd2]; +; O0-NEXT: ld.b8 %rs7, [%rd2+1]; +; O0-NEXT: shl.b16 %rs8, %rs7, 8; +; O0-NEXT: or.b16 %rs9, %rs8, %rs6; +; O0-NEXT: cvt.u32.u16 %r2, %rs9; +; O0-NEXT: ld.s8 %rs10, [%rd2+2]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs11, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs12, %r4; +; O0-NEXT: rem.s16 %rs13, %rs12, %rs11; +; O0-NEXT: cvt.u32.u16 %r5, %rs13; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U; +; O0-NEXT: cvt.u16.u32 %rs14, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U; +; O0-NEXT: cvt.u16.u32 %rs15, %r7; +; O0-NEXT: rem.s16 %rs16, %rs15, %rs14; +; O0-NEXT: cvt.u32.u16 %r8, %rs16; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: // implicit-def: %r11 +; O0-NEXT: // implicit-def: %r12 +; O0-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; +; O0-NEXT: rem.s16 %rs17, %rs5, %rs10; +; O0-NEXT: cvt.u16.u32 %rs18, %r13; +; O0-NEXT: st.b8 [%rd3], %rs18; +; O0-NEXT: shr.u16 %rs19, %rs18, 8; +; O0-NEXT: st.b8 [%rd3+1], %rs19; +; O0-NEXT: st.b8 [%rd3+2], %rs17; +; O0-NEXT: ret; +; +; O3-LABEL: 
test_srem_v3i8( +; O3: { +; O3-NEXT: .reg .b16 %rs<20>; +; O3-NEXT: .reg .b32 %r<14>; +; O3-NEXT: .reg .b64 %rd<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: // %entry +; O3-NEXT: ld.param.b64 %rd1, [test_srem_v3i8_param_0]; +; O3-NEXT: ld.b8 %rs1, [%rd1]; +; O3-NEXT: ld.b8 %rs2, [%rd1+1]; +; O3-NEXT: shl.b16 %rs3, %rs2, 8; +; O3-NEXT: or.b16 %rs4, %rs3, %rs1; +; O3-NEXT: cvt.u32.u16 %r1, %rs4; +; O3-NEXT: ld.s8 %rs5, [%rd1+2]; +; O3-NEXT: ld.param.b64 %rd2, [test_srem_v3i8_param_1]; +; O3-NEXT: ld.b8 %rs6, [%rd2]; +; O3-NEXT: ld.b8 %rs7, [%rd2+1]; +; O3-NEXT: shl.b16 %rs8, %rs7, 8; +; O3-NEXT: or.b16 %rs9, %rs8, %rs6; +; O3-NEXT: cvt.u32.u16 %r2, %rs9; +; O3-NEXT: ld.s8 %rs10, [%rd2+2]; +; O3-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs11, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs12, %r4; +; O3-NEXT: rem.s16 %rs13, %rs12, %rs11; +; O3-NEXT: cvt.u32.u16 %r5, %rs13; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U; +; O3-NEXT: cvt.u16.u32 %rs14, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U; +; O3-NEXT: cvt.u16.u32 %rs15, %r7; +; O3-NEXT: rem.s16 %rs16, %rs15, %rs14; +; O3-NEXT: cvt.u32.u16 %r8, %rs16; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; +; O3-NEXT: rem.s16 %rs17, %rs5, %rs10; +; O3-NEXT: st.b8 [%rd3+2], %rs17; +; O3-NEXT: cvt.u16.u32 %rs18, %r13; +; O3-NEXT: st.b8 [%rd3], %rs18; +; O3-NEXT: shr.u16 %rs19, %rs18, 8; +; O3-NEXT: st.b8 [%rd3+1], %rs19; +; O3-NEXT: ret; entry: %t57 = load <3 x i8>, ptr %a, align 1 %t59 = load <3 x i8>, ptr %b, align 1 @@ -1383,39 +2235,73 @@ entry: } define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: test_sext_v4i1_to_v4i8( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: ld.b32 %r2, [%rd2]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.gt.u32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.gt.u32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.gt.u32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.gt.u32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, -1, 0, %p4; -; CHECK-NEXT: selp.b32 %r12, -1, 0, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, -1, 0, %p2; -; CHECK-NEXT: selp.b32 %r15, -1, 0, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.b32 [%rd3], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_sext_v4i1_to_v4i8( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: // %entry +; O0-NEXT: ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2]; +; O0-NEXT: ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; +; O0-NEXT: ld.b32 
%r1, [%rd1]; +; O0-NEXT: ld.b32 %r2, [%rd2]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.gt.u32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.gt.u32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.gt.u32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.gt.u32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, -1, 0, %p4; +; O0-NEXT: selp.b32 %r12, -1, 0, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, -1, 0, %p2; +; O0-NEXT: selp.b32 %r15, -1, 0, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.b32 [%rd3], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_sext_v4i1_to_v4i8( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b64 %rd<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: // %entry +; O3-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1]; +; O3-NEXT: ld.b32 %r2, [%rd2]; +; O3-NEXT: ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.gt.u32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.gt.u32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.gt.u32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.gt.u32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, -1, 0, %p4; +; O3-NEXT: selp.b32 %r12, -1, 0, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, -1, 0, %p2; +; O3-NEXT: selp.b32 %r15, -1, 0, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.b32 [%rd3], %r17; +; O3-NEXT: ret; entry: %t1 = load <4 x i8>, ptr %a, align 4 %t2 = load <4 x i8>, ptr %b, align 4 diff --git a/llvm/test/CodeGen/NVPTX/no-stack-protector-libcall-error.ll b/llvm/test/CodeGen/NVPTX/no-stack-protector-libcall-error.ll new file mode 100644 index 0000000000000..f877d95dd3769 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/no-stack-protector-libcall-error.ll @@ -0,0 +1,10 @@ +; RUN: not opt -disable-output -mtriple=nvptx64-- -enable-selectiondag-sp=0 -passes=stack-protector %s 2>&1 | FileCheck %s + +; CHECK: error: no libcall available for stack protector +define void @func() sspreq nounwind { + %alloca = alloca i32, align 4 + call void @capture(ptr %alloca) + ret void +} + +declare void @capture(ptr) diff --git a/llvm/test/CodeGen/NVPTX/prmt-const-folding.ll b/llvm/test/CodeGen/NVPTX/prmt-const-folding.ll new file mode 100644 index 0000000000000..7afead127c84f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/prmt-const-folding.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mcpu=sm_50 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -verify-machineinstrs -mcpu=sm_50 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +@g = global i32 0 + +define void @test_prmt_f4e() { +; CHECK-LABEL: test_prmt_f4e( +; CHECK: { +; CHECK-EMPTY: +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 50462976; +; CHECK-NEXT: st.volatile.global.b32 [g], 67305985; +; CHECK-NEXT: st.volatile.global.b32 [g], 84148994; +; CHECK-NEXT: st.volatile.global.b32 [g], 100992003; +; CHECK-NEXT: st.volatile.global.b32 [g], 50462976; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x3) + store volatile i32 %v4, ptr @g + %v5 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x4) + store volatile i32 %v5, ptr @g + ret void +} + +define void @test_prmt_b4e() { +; CHECK-LABEL: test_prmt_b4e( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 84281088; +; CHECK-NEXT: st.volatile.global.b32 [g], 101122049; +; CHECK-NEXT: st.volatile.global.b32 [g], 117440770; +; CHECK-NEXT: st.volatile.global.b32 [g], 66051; +; CHECK-NEXT: st.volatile.global.b32 [g], 84281088; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x3) + store volatile i32 %v4, ptr @g + %v5 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x4) + store volatile i32 %v5, ptr @g + ret void +} + +define void @test_prmt_ecl() { +; CHECK-LABEL: test_prmt_ecl( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 50462976; +; CHECK-NEXT: st.volatile.global.b32 [g], 50462977; +; CHECK-NEXT: st.volatile.global.b32 [g], 50463234; +; CHECK-NEXT: st.volatile.global.b32 [g], 50529027; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.ecl(i32 u0x03020100, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.ecl(i32 u0x03020100, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.ecl(i32 u0x03020100, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.ecl(i32 u0x03020100, i32 u0x3) + store volatile i32 %v4, ptr @g + ret void +} + +define void @test_prmt_ecr() { +; CHECK-LABEL: test_prmt_ecr( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 0; +; CHECK-NEXT: st.volatile.global.b32 [g], 16843008; +; CHECK-NEXT: st.volatile.global.b32 [g], 33685760; +; CHECK-NEXT: st.volatile.global.b32 [g], 50462976; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.ecr(i32 u0x03020100, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.ecr(i32 u0x03020100, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.ecr(i32 u0x03020100, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.ecr(i32 u0x03020100, i32 u0x3) + store volatile i32 %v4, ptr @g + ret void +} + +define void @test_prmt_rc8() { +; CHECK-LABEL: test_prmt_rc8( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; 
CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 0; +; CHECK-NEXT: st.volatile.global.b32 [g], 16843009; +; CHECK-NEXT: st.volatile.global.b32 [g], 33686018; +; CHECK-NEXT: st.volatile.global.b32 [g], 50529027; +; CHECK-NEXT: st.volatile.global.b32 [g], 0; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x3) + store volatile i32 %v4, ptr @g + %v5 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x4) + store volatile i32 %v5, ptr @g + ret void +} + +define void @test_prmt_rc16() { +; CHECK-LABEL: test_prmt_rc16( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 16777472; +; CHECK-NEXT: st.volatile.global.b32 [g], 50463490; +; CHECK-NEXT: st.volatile.global.b32 [g], 16777472; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.rc16(i32 u0x03020100, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.rc16(i32 u0x03020100, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.rc16(i32 u0x03020100, i32 u0x2) + store volatile i32 %v3, ptr @g + ret void +} + + +define void @test_prmt_basic() { +; CHECK-LABEL: test_prmt_basic( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 66051; +; CHECK-NEXT: st.volatile.global.b32 [g], 117507841; +; CHECK-NEXT: st.volatile.global.b32 [g], 1146447479; +; CHECK-NEXT: st.volatile.global.b32 [g], 0; +; CHECK-NEXT: st.volatile.global.b32 [g], -16711936; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt(i32 u0x03020100, i32 u0x07060504, i32 u0x0123) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt(i32 u0x03020100, i32 u0x07060504, i32 u0x7171) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt(i32 u0x33221100, i32 u0x77665544, i32 u0x4567) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt(i32 u0x33221100, i32 u0x77665544, i32 u0xBA98) + store volatile i32 %v4, ptr @g + %v5 = call i32 @llvm.nvvm.prmt(i32 u0xF322F100, i32 u0x77665544, i32 u0xBA98) + store volatile i32 %v5, ptr @g + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll b/llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll new file mode 100644 index 0000000000000..06c8f9a3b4bb6 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll @@ -0,0 +1,8 @@ +; Adding -enable-matrix, which is disabled by default, forces the initialization +; of the PPCSubtarget which verifies the incompatible CPU features. 
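+; A minimal reproduction outside lit (assuming a locally built llc; the input
+; file name is just a placeholder) would be:
+;   llc -mtriple=powerpcspe -mattr=+vsx -enable-matrix input.ll
+; and it should exit non-zero with the diagnostic checked below.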
+; RUN: not llc -mtriple=powerpcspe -mattr=+vsx -enable-matrix < %s 2>&1 | FileCheck %s + +; CHECK: SPE and traditional floating point cannot both be enabled +define void @test() { + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/stack-protector-target.ll b/llvm/test/CodeGen/PowerPC/stack-protector-target.ll new file mode 100644 index 0000000000000..03ffa0b4c142b --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stack-protector-target.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=powerpc-unknown-openbsd < %s | FileCheck -check-prefix=OPENBSD32 %s +; RUN: llc -mtriple=powerpc64-unknown-openbsd < %s | FileCheck -check-prefix=OPENBSD64 %s +; RUN: llc -mtriple=powerpc-unknown-linux < %s | FileCheck -check-prefix=LINUX32 %s +; RUN: llc -mtriple=powerpc64-unknown-linux < %s | FileCheck -check-prefix=LINUX64 %s +; RUN: llc -mtriple=powerpc-ibm-aix-xcoff < %s | FileCheck -check-prefix=AIX32 %s +; RUN: llc -mtriple=powerpc64-ibm-aix-xcoff < %s | FileCheck -check-prefix=AIX64 %s + +define void @func() sspreq nounwind { +; OPENBSD32-LABEL: func: +; OPENBSD32: # %bb.0: +; OPENBSD32-NEXT: mflr 0 +; OPENBSD32-NEXT: stwu 1, -32(1) +; OPENBSD32-NEXT: stw 0, 36(1) +; OPENBSD32-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; OPENBSD32-NEXT: lis 30, __guard_local@ha +; OPENBSD32-NEXT: lwz 3, __guard_local@l(30) +; OPENBSD32-NEXT: stw 3, 20(1) +; OPENBSD32-NEXT: addi 3, 1, 16 +; OPENBSD32-NEXT: bl capture +; OPENBSD32-NEXT: lwz 3, __guard_local@l(30) +; OPENBSD32-NEXT: lwz 4, 20(1) +; OPENBSD32-NEXT: cmplw 3, 4 +; OPENBSD32-NEXT: bne- 0, .LBB0_2 +; OPENBSD32-NEXT: # %bb.1: # %SP_return +; OPENBSD32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; OPENBSD32-NEXT: lwz 0, 36(1) +; OPENBSD32-NEXT: addi 1, 1, 32 +; OPENBSD32-NEXT: mtlr 0 +; OPENBSD32-NEXT: blr +; OPENBSD32-NEXT: .LBB0_2: # %CallStackCheckFailBlk +; OPENBSD32-NEXT: lis 3, .LSSH@ha +; OPENBSD32-NEXT: la 3, .LSSH@l(3) +; OPENBSD32-NEXT: bl __stack_smash_handler +; +; OPENBSD64-LABEL: func: +; OPENBSD64: # %bb.0: +; OPENBSD64-NEXT: mflr 0 +; OPENBSD64-NEXT: std 30, -16(1) # 8-byte Folded Spill +; OPENBSD64-NEXT: stdu 1, -64(1) +; OPENBSD64-NEXT: std 0, 80(1) +; OPENBSD64-NEXT: addis 30, 2, __guard_local@toc@ha +; OPENBSD64-NEXT: ld 3, __guard_local@toc@l(30) +; OPENBSD64-NEXT: std 3, 40(1) +; OPENBSD64-NEXT: addi 3, 1, 36 +; OPENBSD64-NEXT: bl capture +; OPENBSD64-NEXT: nop +; OPENBSD64-NEXT: ld 3, __guard_local@toc@l(30) +; OPENBSD64-NEXT: ld 4, 40(1) +; OPENBSD64-NEXT: cmpld 3, 4 +; OPENBSD64-NEXT: bne- 0, .LBB0_2 +; OPENBSD64-NEXT: # %bb.1: # %SP_return +; OPENBSD64-NEXT: addi 1, 1, 64 +; OPENBSD64-NEXT: ld 0, 16(1) +; OPENBSD64-NEXT: mtlr 0 +; OPENBSD64-NEXT: ld 30, -16(1) # 8-byte Folded Reload +; OPENBSD64-NEXT: blr +; OPENBSD64-NEXT: .LBB0_2: # %CallStackCheckFailBlk +; OPENBSD64-NEXT: addis 3, 2, .LSSH@toc@ha +; OPENBSD64-NEXT: addi 3, 3, .LSSH@toc@l +; OPENBSD64-NEXT: bl __stack_smash_handler +; OPENBSD64-NEXT: nop +; +; LINUX32-LABEL: func: +; LINUX32: # %bb.0: +; LINUX32-NEXT: mflr 0 +; LINUX32-NEXT: stwu 1, -16(1) +; LINUX32-NEXT: stw 0, 20(1) +; LINUX32-NEXT: lwz 3, -28680(2) +; LINUX32-NEXT: stw 3, 12(1) +; LINUX32-NEXT: addi 3, 1, 8 +; LINUX32-NEXT: bl capture +; LINUX32-NEXT: lwz 3, 12(1) +; LINUX32-NEXT: lwz 4, -28680(2) +; LINUX32-NEXT: cmplw 4, 3 +; LINUX32-NEXT: bne 0, .LBB0_2 +; LINUX32-NEXT: # %bb.1: +; LINUX32-NEXT: lwz 0, 20(1) +; LINUX32-NEXT: addi 1, 1, 16 +; LINUX32-NEXT: mtlr 0 +; LINUX32-NEXT: blr +; LINUX32-NEXT: .LBB0_2: +; 
LINUX32-NEXT: bl __stack_chk_fail +; +; LINUX64-LABEL: func: +; LINUX64: # %bb.0: +; LINUX64-NEXT: mflr 0 +; LINUX64-NEXT: stdu 1, -128(1) +; LINUX64-NEXT: std 0, 144(1) +; LINUX64-NEXT: ld 3, -28688(13) +; LINUX64-NEXT: std 3, 120(1) +; LINUX64-NEXT: addi 3, 1, 116 +; LINUX64-NEXT: bl capture +; LINUX64-NEXT: nop +; LINUX64-NEXT: ld 3, 120(1) +; LINUX64-NEXT: ld 4, -28688(13) +; LINUX64-NEXT: cmpld 4, 3 +; LINUX64-NEXT: bne 0, .LBB0_2 +; LINUX64-NEXT: # %bb.1: +; LINUX64-NEXT: addi 1, 1, 128 +; LINUX64-NEXT: ld 0, 16(1) +; LINUX64-NEXT: mtlr 0 +; LINUX64-NEXT: blr +; LINUX64-NEXT: .LBB0_2: +; LINUX64-NEXT: bl __stack_chk_fail +; LINUX64-NEXT: nop +; +; AIX32-LABEL: func: +; AIX32: # %bb.0: +; AIX32-NEXT: mflr 0 +; AIX32-NEXT: stwu 1, -80(1) +; AIX32-NEXT: stw 0, 88(1) +; AIX32-NEXT: stw 31, 76(1) # 4-byte Folded Spill +; AIX32-NEXT: lwz 31, L..C0(2) # @__ssp_canary_word +; AIX32-NEXT: lwz 3, 0(31) +; AIX32-NEXT: stw 3, 72(1) +; AIX32-NEXT: addi 3, 1, 68 +; AIX32-NEXT: bl .capture[PR] +; AIX32-NEXT: nop +; AIX32-NEXT: lwz 3, 0(31) +; AIX32-NEXT: lwz 4, 72(1) +; AIX32-NEXT: cmplw 3, 4 +; AIX32-NEXT: bne 0, L..BB0_2 +; AIX32-NEXT: # %bb.1: +; AIX32-NEXT: lwz 31, 76(1) # 4-byte Folded Reload +; AIX32-NEXT: addi 1, 1, 80 +; AIX32-NEXT: lwz 0, 8(1) +; AIX32-NEXT: mtlr 0 +; AIX32-NEXT: blr +; AIX32-NEXT: L..BB0_2: +; AIX32-NEXT: bl .__stack_chk_fail[PR] +; AIX32-NEXT: nop +; +; AIX64-LABEL: func: +; AIX64: # %bb.0: +; AIX64-NEXT: mflr 0 +; AIX64-NEXT: stdu 1, -144(1) +; AIX64-NEXT: std 0, 160(1) +; AIX64-NEXT: std 31, 136(1) # 8-byte Folded Spill +; AIX64-NEXT: ld 31, L..C0(2) # @__ssp_canary_word +; AIX64-NEXT: ld 3, 0(31) +; AIX64-NEXT: std 3, 128(1) +; AIX64-NEXT: addi 3, 1, 124 +; AIX64-NEXT: bl .capture[PR] +; AIX64-NEXT: nop +; AIX64-NEXT: ld 3, 0(31) +; AIX64-NEXT: ld 4, 128(1) +; AIX64-NEXT: cmpld 3, 4 +; AIX64-NEXT: bne 0, L..BB0_2 +; AIX64-NEXT: # %bb.1: +; AIX64-NEXT: ld 31, 136(1) # 8-byte Folded Reload +; AIX64-NEXT: addi 1, 1, 144 +; AIX64-NEXT: ld 0, 16(1) +; AIX64-NEXT: mtlr 0 +; AIX64-NEXT: blr +; AIX64-NEXT: L..BB0_2: +; AIX64-NEXT: bl .__stack_chk_fail[PR] +; AIX64-NEXT: nop + %alloca = alloca i32, align 4 + call void @capture(ptr %alloca) + ret void +} + +declare void @capture(ptr) diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 43e16606aae73..d566069f4200b 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -106,6 +106,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisync %s -o - | FileCheck --check-prefix=RV32XQCISYNC %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV32XANDESPERF %s +; RUN: llc -mtriple=riscv32 -mattr=+xandesbfhcvt %s -o - | FileCheck --check-prefix=RV32XANDESBFHCVT %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV32XANDESVBFHCVT %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvsintload %s -o - | FileCheck --check-prefix=RV32XANDESVSINTLOAD %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV32XANDESVDOT %s @@ -262,6 +263,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV64XTHEADSYNC %s ; RUN: llc -mtriple=riscv64 -mattr=+xtheadvdot %s -o - | FileCheck --check-prefixes=CHECK,RV64XTHEADVDOT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV64XANDESPERF %s +; RUN: llc 
-mtriple=riscv64 -mattr=+xandesbfhcvt %s -o - | FileCheck --check-prefix=RV64XANDESBFHCVT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV64XANDESVBFHCVT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvsintload %s -o - | FileCheck --check-prefix=RV64XANDESVSINTLOAD %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV64XANDESVDOT %s @@ -462,6 +464,7 @@ ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2" ; RV32XQCISYNC: attribute 5, "rv32i2p1_zca1p0_xqcisync0p3" ; RV32XANDESPERF: .attribute 5, "rv32i2p1_xandesperf5p0" +; RV32XANDESBFHCVT: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_xandesbfhcvt5p0" ; RV32XANDESVBFHCVT: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0" ; RV32XANDESVSINTLOAD: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvsintload5p0" ; RV32XANDESVDOT: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0" @@ -620,6 +623,7 @@ ; RV64XTHEADSYNC: .attribute 5, "rv64i2p1_xtheadsync1p0" ; RV64XTHEADVDOT: .attribute 5, "rv64i2p1_f2p2_d2p2_v1p0_zicsr2p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xtheadvdot1p0" ; RV64XANDESPERF: .attribute 5, "rv64i2p1_xandesperf5p0" +; RV64XANDESBFHCVT: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_xandesbfhcvt5p0" ; RV64XANDESVBFHCVT: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0" ; RV64XANDESVSINTLOAD: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvsintload5p0" ; RV64XANDESVDOT: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0" diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 26f5ab10f6c36..b94665b718ae7 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -173,6 +173,7 @@ ; CHECK-NEXT: ventana-veyron - Ventana Veyron-Series processors. ; CHECK-NEXT: vl-dependent-latency - Latency of vector instructions is dependent on the dynamic value of vl. ; CHECK-NEXT: vxrm-pipeline-flush - VXRM writes causes pipeline flush. +; CHECK-NEXT: xandesbfhcvt - 'XAndesBFHCvt' (Andes Scalar BFLOAT16 Conversion Extension). ; CHECK-NEXT: xandesperf - 'XAndesPerf' (Andes Performance Extension). ; CHECK-NEXT: xandesvbfhcvt - 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension). ; CHECK-NEXT: xandesvdot - 'XAndesVDot' (Andes Vector Dot Product Extension). 
diff --git a/llvm/test/CodeGen/RISCV/fpenv-xlen.ll b/llvm/test/CodeGen/RISCV/fpenv-xlen.ll index 148186b21c125..255c120434f34 100644 --- a/llvm/test/CodeGen/RISCV/fpenv-xlen.ll +++ b/llvm/test/CodeGen/RISCV/fpenv-xlen.ll @@ -35,3 +35,37 @@ entry: call void @llvm.reset.fpenv() ret void } + +define iXLen @func_get_fpmode() { +; CHECK-LABEL: func_get_fpmode: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: frcsr a0 +; CHECK-NEXT: ret +entry: + %fpenv = call iXLen @llvm.get.fpmode.iXLen() + ret iXLen %fpenv +} + +define void @func_set_fpmode(iXLen %fpmode) { +; CHECK-LABEL: func_set_fpmode: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, -32 +; CHECK-NEXT: csrc fcsr, a1 +; CHECK-NEXT: andi a0, a0, -32 +; CHECK-NEXT: csrs fcsr, a0 +; CHECK-NEXT: ret +entry: + call void @llvm.set.fpmode.iXLen(iXLen %fpmode) + ret void +} + +define void @func_reset_fpmode() { +; CHECK-LABEL: func_reset_fpmode: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a0, -32 +; CHECK-NEXT: csrc fcsr, a0 +; CHECK-NEXT: ret +entry: + call void @llvm.reset.fpmode() + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll index 0d8aff306252e..2d4fce68f9545 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll @@ -313,12 +313,12 @@ define i32 @test_nxv128i1( %x) { ; CHECK-NEXT: vslidedown.vx v0, v6, a0 ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v7, a1 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a0 ; CHECK-NEXT: vslidedown.vx v5, v6, a0 -; CHECK-NEXT: vslidedown.vx v4, v7, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v4 ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v5 ; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t @@ -425,13 +425,15 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v5, a1 -; CHECK-NEXT: vslidedown.vx v5, v7, a1 -; CHECK-NEXT: vslidedown.vx v4, v6, a1 -; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v4 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v6, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t -; CHECK-NEXT: vmv1r.v v0, v5 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu ; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: addi a2, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll index 796f8dde58f47..15417da962bd3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll @@ -139,21 +139,20 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind { ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: addi a3, sp, 64 ; RV32-NEXT: vl8r.v v8, (a0) ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: vl8r.v v24, (a0) +; RV32-NEXT: vl8r.v v16, (a0) ; RV32-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV32-NEXT: 
vmseq.vi v0, v8, 0 -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a2, a3, a2 -; RV32-NEXT: vmseq.vi v8, v24, 0 -; RV32-NEXT: vmerge.vim v24, v16, 1, v0 -; RV32-NEXT: vs8r.v v24, (a3) -; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vmerge.vim v8, v16, 1, v0 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v24, v8, 1, v0 +; RV32-NEXT: vmseq.vi v0, v16, 0 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs8r.v v24, (a0) +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-NEXT: vs8r.v v8, (a2) ; RV32-NEXT: lbu a0, 0(a1) ; RV32-NEXT: addi sp, s0, -80 @@ -179,21 +178,20 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind { ; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: addi a3, sp, 64 ; RV64-NEXT: vl8r.v v8, (a0) ; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: vl8r.v v24, (a0) +; RV64-NEXT: vl8r.v v16, (a0) ; RV64-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV64-NEXT: vmseq.vi v0, v8, 0 -; RV64-NEXT: vmv.v.i v16, 0 -; RV64-NEXT: add a1, a3, a1 -; RV64-NEXT: add a2, a3, a2 -; RV64-NEXT: vmseq.vi v8, v24, 0 -; RV64-NEXT: vmerge.vim v24, v16, 1, v0 -; RV64-NEXT: vs8r.v v24, (a3) -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vmerge.vim v8, v16, 1, v0 +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vim v24, v8, 1, v0 +; RV64-NEXT: vmseq.vi v0, v16, 0 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs8r.v v24, (a0) +; RV64-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-NEXT: vs8r.v v8, (a2) ; RV64-NEXT: lbu a0, 0(a1) ; RV64-NEXT: addi sp, s0, -80 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index 3e822d357b667..807651c9b40c6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -274,6 +274,59 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2 } +define { <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_partial(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor3_partial: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: ret + %vec = load <24 x i8>, ptr %p + %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec) + %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0 + %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2 + %res0 = insertvalue { <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8> } %res0, <8 x i8> %t2, 1 + ret { <8 x i8>, <8 x i8> } %res1 +} + +; InterleavedAccess should kick in even if the users of deinterleave intrinsic are not extractvalue. 
+define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_no_extract(ptr %p, ptr %p1, i1 %c) { +; CHECK-LABEL: vector_deinterleave_load_factor3_no_extract: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a2, a2, 1 +; CHECK-NEXT: beqz a2, .LBB17_2 +; CHECK-NEXT: # %bb.1: # %bb0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vlseg3e8.v v6, (a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB17_2: # %bb1 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vlseg3e8.v v6, (a1) +; CHECK-NEXT: ret + br i1 %c, label %bb0, label %bb1 + +bb0: + %vec0 = load <24 x i8>, ptr %p + %d0.0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec0) + br label %merge + +bb1: + %vec1 = load <24 x i8>, ptr %p1 + %d0.1 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec1) + br label %merge + +merge: + %d0 = phi {<8 x i8>, <8 x i8>, <8 x i8>} [%d0.0, %bb0], [%d0.1, %bb1] + %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0 + %t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1 + %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2 + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 0 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 0 + ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2 +} + define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll index 2587411566a3f..fb070b24a4f34 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll @@ -324,24 +324,23 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill ; RV32-NEXT: addi s0, sp, 384 ; RV32-NEXT: andi sp, sp, -128 -; RV32-NEXT: zext.b a1, a1 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: li a3, 128 -; RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV32-NEXT: li a2, 128 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; RV32-NEXT: vle8.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle8.v v16, (a0) -; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: vmseq.vi v0, v8, 0 -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: vmseq.vi v8, v16, 0 -; RV32-NEXT: vmerge.vim v16, v24, 1, v0 -; RV32-NEXT: vse8.v v16, (a2) -; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vmerge.vim v8, v24, 1, v0 -; RV32-NEXT: addi a0, sp, 128 -; RV32-NEXT: vse8.v v8, (a0) -; RV32-NEXT: lbu a0, 0(a1) +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v24, v8, 1, v0 +; RV32-NEXT: vmseq.vi v0, v16, 0 +; RV32-NEXT: zext.b a0, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: vse8.v v24, (a1) +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 +; RV32-NEXT: addi a1, sp, 128 +; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: lbu a0, 0(a0) ; RV32-NEXT: addi sp, s0, -384 ; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload @@ -355,24 +354,23 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 384 ; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: zext.b a1, a1 -; RV64-NEXT: mv a2, sp -; RV64-NEXT: li a3, 128 -; RV64-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV64-NEXT: li a2, 128 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; 
RV64-NEXT: vle8.v v8, (a0) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle8.v v16, (a0) -; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: vmseq.vi v0, v8, 0 -; RV64-NEXT: vmv.v.i v24, 0 -; RV64-NEXT: vmseq.vi v8, v16, 0 -; RV64-NEXT: vmerge.vim v16, v24, 1, v0 -; RV64-NEXT: vse8.v v16, (a2) -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vmerge.vim v8, v24, 1, v0 -; RV64-NEXT: addi a0, sp, 128 -; RV64-NEXT: vse8.v v8, (a0) -; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vim v24, v8, 1, v0 +; RV64-NEXT: vmseq.vi v0, v16, 0 +; RV64-NEXT: zext.b a0, a1 +; RV64-NEXT: mv a1, sp +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: vse8.v v24, (a1) +; RV64-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64-NEXT: addi a1, sp, 128 +; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: lbu a0, 0(a0) ; RV64-NEXT: addi sp, s0, -384 ; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload @@ -386,24 +384,23 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV32ZBS-NEXT: sw s0, 376(sp) # 4-byte Folded Spill ; RV32ZBS-NEXT: addi s0, sp, 384 ; RV32ZBS-NEXT: andi sp, sp, -128 -; RV32ZBS-NEXT: zext.b a1, a1 -; RV32ZBS-NEXT: mv a2, sp -; RV32ZBS-NEXT: li a3, 128 -; RV32ZBS-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV32ZBS-NEXT: li a2, 128 +; RV32ZBS-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; RV32ZBS-NEXT: vle8.v v8, (a0) ; RV32ZBS-NEXT: addi a0, a0, 128 ; RV32ZBS-NEXT: vle8.v v16, (a0) -; RV32ZBS-NEXT: add a1, a2, a1 ; RV32ZBS-NEXT: vmseq.vi v0, v8, 0 -; RV32ZBS-NEXT: vmv.v.i v24, 0 -; RV32ZBS-NEXT: vmseq.vi v8, v16, 0 -; RV32ZBS-NEXT: vmerge.vim v16, v24, 1, v0 -; RV32ZBS-NEXT: vse8.v v16, (a2) -; RV32ZBS-NEXT: vmv1r.v v0, v8 -; RV32ZBS-NEXT: vmerge.vim v8, v24, 1, v0 -; RV32ZBS-NEXT: addi a0, sp, 128 -; RV32ZBS-NEXT: vse8.v v8, (a0) -; RV32ZBS-NEXT: lbu a0, 0(a1) +; RV32ZBS-NEXT: vmv.v.i v8, 0 +; RV32ZBS-NEXT: vmerge.vim v24, v8, 1, v0 +; RV32ZBS-NEXT: vmseq.vi v0, v16, 0 +; RV32ZBS-NEXT: zext.b a0, a1 +; RV32ZBS-NEXT: mv a1, sp +; RV32ZBS-NEXT: add a0, a1, a0 +; RV32ZBS-NEXT: vse8.v v24, (a1) +; RV32ZBS-NEXT: vmerge.vim v8, v8, 1, v0 +; RV32ZBS-NEXT: addi a1, sp, 128 +; RV32ZBS-NEXT: vse8.v v8, (a1) +; RV32ZBS-NEXT: lbu a0, 0(a0) ; RV32ZBS-NEXT: addi sp, s0, -384 ; RV32ZBS-NEXT: lw ra, 380(sp) # 4-byte Folded Reload ; RV32ZBS-NEXT: lw s0, 376(sp) # 4-byte Folded Reload @@ -417,24 +414,23 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV64ZBS-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; RV64ZBS-NEXT: addi s0, sp, 384 ; RV64ZBS-NEXT: andi sp, sp, -128 -; RV64ZBS-NEXT: zext.b a1, a1 -; RV64ZBS-NEXT: mv a2, sp -; RV64ZBS-NEXT: li a3, 128 -; RV64ZBS-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV64ZBS-NEXT: li a2, 128 +; RV64ZBS-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; RV64ZBS-NEXT: vle8.v v8, (a0) ; RV64ZBS-NEXT: addi a0, a0, 128 ; RV64ZBS-NEXT: vle8.v v16, (a0) -; RV64ZBS-NEXT: add a1, a2, a1 ; RV64ZBS-NEXT: vmseq.vi v0, v8, 0 -; RV64ZBS-NEXT: vmv.v.i v24, 0 -; RV64ZBS-NEXT: vmseq.vi v8, v16, 0 -; RV64ZBS-NEXT: vmerge.vim v16, v24, 1, v0 -; RV64ZBS-NEXT: vse8.v v16, (a2) -; RV64ZBS-NEXT: vmv1r.v v0, v8 -; RV64ZBS-NEXT: vmerge.vim v8, v24, 1, v0 -; RV64ZBS-NEXT: addi a0, sp, 128 -; RV64ZBS-NEXT: vse8.v v8, (a0) -; RV64ZBS-NEXT: lbu a0, 0(a1) +; RV64ZBS-NEXT: vmv.v.i v8, 0 +; RV64ZBS-NEXT: vmerge.vim v24, v8, 1, v0 +; RV64ZBS-NEXT: vmseq.vi v0, v16, 0 +; RV64ZBS-NEXT: zext.b a0, a1 +; RV64ZBS-NEXT: mv a1, sp +; RV64ZBS-NEXT: add a0, a1, a0 +; RV64ZBS-NEXT: vse8.v v24, (a1) +; RV64ZBS-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZBS-NEXT: addi a1, sp, 128 +; 
RV64ZBS-NEXT: vse8.v v8, (a1) +; RV64ZBS-NEXT: lbu a0, 0(a0) ; RV64ZBS-NEXT: addi sp, s0, -384 ; RV64ZBS-NEXT: ld ra, 376(sp) # 8-byte Folded Reload ; RV64ZBS-NEXT: ld s0, 368(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 0c30cbe4a42ef..9df71cfc96cc7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -5707,3 +5707,197 @@ define void @msub_vv_v2i64_2(ptr %x, <2 x i64> %y) { store <2 x i64> %c, ptr %x ret void } + +define <8 x i8> @vsub_if_uge_v8i8(<8 x i8> %va, <8 x i8> %vb) { +; CHECK-LABEL: vsub_if_uge_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp ult <8 x i8> %va, %vb + %select = select <8 x i1> %cmp, <8 x i8> zeroinitializer, <8 x i8> %vb + %sub = sub nuw <8 x i8> %va, %select + ret <8 x i8> %sub +} + +define <8 x i8> @vsub_if_uge_swapped_v8i8(<8 x i8> %va, <8 x i8> %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp uge <8 x i8> %va, %vb + %select = select <8 x i1> %cmp, <8 x i8> %vb, <8 x i8> zeroinitializer + %sub = sub nuw <8 x i8> %va, %select + ret <8 x i8> %sub +} + +define <8 x i16> @vsub_if_uge_v8i16(<8 x i16> %va, <8 x i16> %vb) { +; CHECK-LABEL: vsub_if_uge_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp ult <8 x i16> %va, %vb + %select = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vb + %sub = sub nuw <8 x i16> %va, %select + ret <8 x i16> %sub +} + +define <8 x i16> @vsub_if_uge_swapped_v8i16(<8 x i16> %va, <8 x i16> %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp uge <8 x i16> %va, %vb + %select = select <8 x i1> %cmp, <8 x i16> %vb, <8 x i16> zeroinitializer + %sub = sub nuw <8 x i16> %va, %select + ret <8 x i16> %sub +} + +define <4 x i32> @vsub_if_uge_v4i32(<4 x i32> %va, <4 x i32> %vb) { +; CHECK-LABEL: vsub_if_uge_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp ult <4 x i32> %va, %vb + %select = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vb + %sub = sub nuw <4 x i32> %va, %select + ret <4 x i32> %sub +} + +define <4 x i32> @vsub_if_uge_swapped_v4i32(<4 x i32> %va, <4 x i32> %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp uge <4 x i32> %va, %vb + %select = select <4 x i1> %cmp, <4 x i32> %vb, <4 x i32> zeroinitializer + %sub = sub nuw <4 x i32> %va, %select + ret <4 x i32> %sub +} + +define <2 x i64> @vsub_if_uge_v2i64(<2 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: vsub_if_uge_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp ult <2 x i64> %va, %vb + %select = 
select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vb + %sub = sub nuw <2 x i64> %va, %select + ret <2 x i64> %sub +} + +define <2 x i64> @vsub_if_uge_swapped_v2i64(<2 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp uge <2 x i64> %va, %vb + %select = select <2 x i1> %cmp, <2 x i64> %vb, <2 x i64> zeroinitializer + %sub = sub nuw <2 x i64> %va, %select + ret <2 x i64> %sub +} + +define <8 x i8> @sub_if_uge_C_v8i8(<8 x i8> %x) { +; CHECK-LABEL: sub_if_uge_C_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vi v9, v8, -13 +; CHECK-NEXT: vminu.vv v8, v9, v8 +; CHECK-NEXT: ret + %cmp = icmp ugt <8 x i8> %x, splat (i8 12) + %sub = add <8 x i8> %x, splat (i8 -13) + %select = select <8 x i1> %cmp, <8 x i8> %sub, <8 x i8> %x + ret <8 x i8> %select +} + +define <8 x i16> @sub_if_uge_C_v8i16(<8 x i16> %x) { +; CHECK-LABEL: sub_if_uge_C_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, -2001 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 +; CHECK-NEXT: ret + %cmp = icmp ugt <8 x i16> %x, splat (i16 2000) + %sub = add <8 x i16> %x, splat (i16 -2001) + %select = select <8 x i1> %cmp, <8 x i16> %sub, <8 x i16> %x + ret <8 x i16> %select +} + +define <4 x i32> @sub_if_uge_C_v4i32(<4 x i32> %x) { +; CHECK-LABEL: sub_if_uge_C_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 1048560 +; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 +; CHECK-NEXT: ret + %cmp = icmp ugt <4 x i32> %x, splat (i32 65520) + %sub = add <4 x i32> %x, splat (i32 -65521) + %select = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %x + ret <4 x i32> %select +} + +define <4 x i32> @sub_if_uge_C_swapped_v4i32(<4 x i32> %x) { +; CHECK-LABEL: sub_if_uge_C_swapped_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 1048560 +; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp ult <4 x i32> %x, splat (i32 65521) + %sub = add <4 x i32> %x, splat (i32 -65521) + %select = select <4 x i1> %cmp, <4 x i32> %x, <4 x i32> %sub + ret <4 x i32> %select +} + +define <2 x i64> @sub_if_uge_C_v2i64(<2 x i64> %x) nounwind { +; RV32-LABEL: sub_if_uge_C_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: li a0, -2 +; RV32-NEXT: lui a1, 876449 +; RV32-NEXT: addi a1, a1, -513 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vadd.vv v9, v8, v9 +; RV32-NEXT: vminu.vv v8, v9, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sub_if_uge_C_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a0, 1048278 +; RV64-NEXT: addi a0, a0, -95 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -513 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vadd.vx v9, v8, a0 +; RV64-NEXT: vminu.vv v8, v9, v8 +; RV64-NEXT: ret + %cmp = icmp ugt <2 x i64> %x, splat (i64 5000000000) + %sub = add <2 x i64> %x, splat (i64 -5000000001) + %select = select <2 x i1> %cmp, <2 x i64> %sub, <2 x i64> %x + ret <2 x i64> %select +} diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 041aae229288f..dbc8e891ab5f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1718,6 +1718,30 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
   ret void
 }
 
+define <4 x i32> @vp_load_factor3_one_active(ptr %ptr) {
+; CHECK-LABEL: vp_load_factor3_one_active:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12)
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  ret <4 x i32> %v0
+}
+
+define <4 x i32> @vp_load_factor5_one_active(ptr %ptr) {
+; CHECK-LABEL: vp_load_factor5_one_active:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 20
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20)
+  %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
+  ret <4 x i32> %v0
+}
+
 define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
 ; CHECK-LABEL: store_factor4_one_active:
 ; CHECK:       # %bb.0:
@@ -1804,8 +1828,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI51_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI51_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI53_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI53_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -1880,8 +1904,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI52_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI52_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI54_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI54_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index c11319ff335fd..67584ba8a82cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -143,16 +143,15 @@ define void @deinterleave6_0_i8(ptr %in, ptr %out) {
 ; CHECK-LABEL: deinterleave6_0_i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vmv.v.i v0, 2
-; CHECK-NEXT:    vmv.v.i v8, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 8
+; CHECK-NEXT:    vslidedown.vi v9, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vslidedown.vi v9, v9, 5, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vrgather.vi v9, v10, 4, v0.t
-; CHECK-NEXT:    vse8.v v9, (a1)
+; CHECK-NEXT:    vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT:    vmv.v.i v0, 4
+; CHECK-NEXT:    vrgather.vi v8, v9, 4, v0.t
+; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1
@@ -188,16 +187,15 @@ 
define void @deinterleave7_0_i8(ptr %in, ptr %out) { ; CHECK-LABEL: deinterleave7_0_i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 8 +; CHECK-NEXT: vslidedown.vi v9, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v9, v9, 6, v0.t -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vrgather.vi v9, v10, 6, v0.t -; CHECK-NEXT: vse8.v v9, (a1) +; CHECK-NEXT: vslidedown.vi v8, v8, 6, v0.t +; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vrgather.vi v8, v9, 6, v0.t +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll index 7afd31fdd663c..a04e31a19a4f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -439,9 +439,10 @@ define <256 x i8> @vsadd_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { define <256 x i8> @vsadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vsadd_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll index f61b112fd8024..5556b11e9a90c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -435,9 +435,10 @@ define <256 x i8> @vsaddu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { define <256 x i8> @vsaddu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vsaddu_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll index 6ddf2e464750e..c28317bf14269 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -454,14 +454,15 @@ define <256 x i8> @vssub_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { define <256 x i8> @vssub_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vssub_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v24, (a0) -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: vssub.vx 
v16, v16, a1, v0.t ; CHECK-NEXT: ret %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 128) ret <256 x i8> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll index c403593894794..cbfe1292877ee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -449,14 +449,15 @@ define <256 x i8> @vssubu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { define <256 x i8> @vssubu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vssubu_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v24, (a0) -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: vssubu.vx v16, v16, a1, v0.t ; CHECK-NEXT: ret %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 128) ret <256 x i8> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll b/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll new file mode 100644 index 0000000000000..cca00bf58063d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +define i32 @_ZN4Mesh12rezone_countESt6vectorIiSaIiEERiS3_( %wide.load, %0, %1, %2, %3) #0 { +; CHECK-LABEL: _ZN4Mesh12rezone_countESt6vectorIiSaIiEERiS3_: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.i v14, 0 +; CHECK-NEXT: .LBB0_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: vmv2r.v v16, v10 +; CHECK-NEXT: vle32.v v16, (a0), v0.t +; CHECK-NEXT: vand.vi v16, v16, 1 +; CHECK-NEXT: vmsne.vi v9, v16, 0 +; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vmerge.vim v12, v12, -1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vor.vi v14, v14, 1, v0.t +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: j .LBB0_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ 1, %vector.body ] + %vec.phi = phi [ zeroinitializer, %entry ], [ %predphi88, %vector.body ] + %vec.phi81 = phi [ zeroinitializer, %entry ], [ %predphi93, %vector.body ] + %wide.load1 = load , ptr null, align 4 + %4 = icmp slt %wide.load, zeroinitializer + %5 = icmp sgt %wide.load, zeroinitializer + %wide.masked.load82 = tail call @llvm.masked.load.nxv4i32.p0(ptr null, i32 1, zeroinitializer, zeroinitializer) + %6 = icmp eq zeroinitializer, zeroinitializer + %7 = getelementptr i32, ptr null, i64 %index + %wide.masked.load83 = tail call @llvm.masked.load.nxv4i32.p0(ptr %7, i32 1, %0, zeroinitializer) + %8 = select %0, %0, 
zeroinitializer + %9 = trunc %wide.masked.load83 to + %narrow = select %0, %9, zeroinitializer + %10 = sext %narrow to + %predphi88 = or %vec.phi, %10 + %11 = zext %0 to + %predphi93 = or %vec.phi81, %11 + %index.next = add i64 0, 1 + br i1 false, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %12 = tail call i32 @llvm.vector.reduce.add.nxv4i32( %vec.phi) + %13 = tail call i32 @llvm.vector.reduce.add.nxv4i32( %vec.phi81) + ret i32 %13 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.masked.load.nxv4i32.p0(ptr captures(none), i32 immarg, , ) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.nxv4i32() #2 + +; uselistorder directives +uselistorder ptr @llvm.masked.load.nxv4i32.p0, { 1, 0 } +uselistorder ptr @llvm.vector.reduce.add.nxv4i32, { 1, 0 } + +attributes #0 = { "target-features"="+v" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index 317ad0c124e73..4883a4dcfcf67 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -1033,9 +1033,8 @@ define @vmadc_vim( %a, %ma ; ; VLOPT-LABEL: vmadc_vim: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; VLOPT-NEXT: vmadc.vim v11, v8, 5, v0 -; VLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; VLOPT-NEXT: vmand.mm v0, v11, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmadc.carry.in.nxv4i32.i32( %a, i32 5, %mask, iXLen -1) @@ -1054,9 +1053,8 @@ define @vmadc_vxm( %a, %ma ; ; VLOPT-LABEL: vmadc_vxm: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; VLOPT-NEXT: vmadc.vxm v11, v8, a0, v0 -; VLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; VLOPT-NEXT: vmand.mm v0, v11, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmadc.carry.in.nxv4i32.i32( %a, i32 %c, %mask, iXLen -1) @@ -1075,9 +1073,8 @@ define @vmadc_vvm( %a, %ma ; ; VLOPT-LABEL: vmadc_vvm: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; VLOPT-NEXT: vmadc.vvm v11, v8, v12, v0 -; VLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; VLOPT-NEXT: vmand.mm v0, v11, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmadc.carry.in.nxv4i32.nxv4i32( %a, %c, %mask, iXLen -1) @@ -1096,9 +1093,8 @@ define @vmsbc_vvm( %a, %ma ; ; VLOPT-LABEL: vmsbc_vvm: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; VLOPT-NEXT: vmsbc.vvm v11, v8, v12, v0 -; VLOPT-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; VLOPT-NEXT: vmand.mm v0, v11, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmsbc.borrow.in.nxv4i32.nxv4i32( %a, %c, %mask, iXLen -1) @@ -1117,9 +1113,8 @@ define @vmsbc_vxm( %a, %ma ; ; VLOPT-LABEL: vmsbc_vxm: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; VLOPT-NEXT: vmsbc.vxm v11, v8, a0, v0 -; VLOPT-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; VLOPT-NEXT: vmand.mm v0, v11, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmsbc.borrow.in.nxv4i32.i32( %a, i32 %c, %mask, iXLen -1) @@ -5413,9 +5408,8 @@ define @vsbc_vvm( %a, %ma ; ; VLOPT-LABEL: vsbc_vvm: 
; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; VLOPT-NEXT: vsbc.vvm v8, v8, v10, v0 ; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsbc.vvm v8, v8, v10, v0 ; VLOPT-NEXT: vadd.vv v8, v8, v12 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.nxv4i1( poison, %a, %b, %mask, iXLen -1) @@ -5434,9 +5428,8 @@ define @vsbc_vxm( %a, %ma ; ; VLOPT-LABEL: vsbc_vxm: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; VLOPT-NEXT: vsbc.vxm v8, v8, a0, v0 ; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vsbc.vxm v8, v8, a0, v0 ; VLOPT-NEXT: vadd.vv v8, v8, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vsbc.nxv4i32.i32.nxv4i1( poison, %a, i32 %c, %mask, iXLen -1) @@ -5455,9 +5448,8 @@ define @vfclass_v( %a, ; ; VLOPT-LABEL: vfclass_v: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; VLOPT-NEXT: vfclass.v v8, v8 ; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vfclass.v v8, v8 ; VLOPT-NEXT: vadd.vv v8, v8, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vfclass.nxv4i32( poison, %a, iXLen -1) @@ -5476,9 +5468,8 @@ define @vrgather_vi( %a, ; ; VLOPT-LABEL: vrgather_vi: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; VLOPT-NEXT: vrgather.vi v12, v8, 5 ; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vrgather.vi v12, v8, 5 ; VLOPT-NEXT: vadd.vv v8, v12, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vrgather.vx.nxv4i32.iXLen( poison, %a, iXLen 5, iXLen -1) @@ -5497,9 +5488,8 @@ define @vrgather_vv( %a, ; ; VLOPT-LABEL: vrgather_vv: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; VLOPT-NEXT: vrgather.vv v12, v8, v10 ; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vrgather.vv v12, v8, v10 ; VLOPT-NEXT: vadd.vv v8, v12, v8 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vrgather.vv.nxv4i32( poison, %a, %idx, iXLen -1) @@ -5518,9 +5508,8 @@ define @vrgather_vx( %a, iXLen %idx, @llvm.riscv.vrgather.vx.nxv4i32.iXLen( poison, %a, iXLen %idx, iXLen -1) @@ -5539,9 +5528,8 @@ define @vrgatherei16_vv( %a, @llvm.riscv.vrgatherei16.vv.nxv4i32( poison, %a, %idx, iXLen -1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir index b39ba422bd349..52cd3e35e6eb8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir @@ -1801,4 +1801,63 @@ body: | ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0 /* e8 */ %x:vr = PseudoVMSET_M_B8 -1, 0 %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0 +... +--- +name: vrgatherei16_vv +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv + ; CHECK: early-clobber %x:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 + %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... +--- +name: vrgatherei16_vv_incompatible_data_eew +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_data_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... 
--- +name: vrgatherei16_vv_incompatible_index_eew +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_index_eew + ; CHECK: %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, %x, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, %x, 1, 5 /* e32 */, 0 +... +--- +name: vrgatherei16_vv_incompatible_dest_emul +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_dest_emul + ; CHECK: early-clobber %x:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 + %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... +--- +name: vrgatherei16_vv_incompatible_source_emul +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_source_emul + ; CHECK: %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 + %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... +--- +name: vrgatherei16_vv_incompatible_index_emul +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_index_emul + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, %x, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, %x, 1, 5 /* e32 */, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll index e3b2d6c1efe1f..9b58cb3d5c891 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll @@ -893,3 +893,197 @@ define @vmin_vi_mask_nxv8i32( %va, %cmp, %va, %vs ret %vc } + +define @vsub_if_uge_nxv2i8( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp ult %va, %vb + %select = select %cmp, zeroinitializer, %vb + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_swapped_nxv2i8( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp uge %va, %vb + %select = select %cmp, %vb, zeroinitializer + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_nxv2i16( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 +; CHECK-NEXT: ret + %cmp = icmp ult %va, %vb + %select = select %cmp, zeroinitializer, %vb + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_swapped_nxv2i16( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, 
zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <vscale x 2 x i16> %va, %vb
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i16> %vb, <vscale x 2 x i16> zeroinitializer
+  %sub = sub nuw <vscale x 2 x i16> %va, %select
+  ret <vscale x 2 x i16> %sub
+}
+
+define <vscale x 2 x i32> @vsub_if_uge_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) {
+; CHECK-LABEL: vsub_if_uge_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <vscale x 2 x i32> %va, %vb
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %vb
+  %sub = sub nuw <vscale x 2 x i32> %va, %select
+  ret <vscale x 2 x i32> %sub
+}
+
+define <vscale x 2 x i32> @vsub_if_uge_swapped_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) {
+; CHECK-LABEL: vsub_if_uge_swapped_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <vscale x 2 x i32> %va, %vb
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i32> %vb, <vscale x 2 x i32> zeroinitializer
+  %sub = sub nuw <vscale x 2 x i32> %va, %select
+  ret <vscale x 2 x i32> %sub
+}
+
+define <vscale x 2 x i64> @vsub_if_uge_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb) {
+; CHECK-LABEL: vsub_if_uge_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsub.vv v10, v8, v10
+; CHECK-NEXT:    vminu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <vscale x 2 x i64> %va, %vb
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> %vb
+  %sub = sub nuw <vscale x 2 x i64> %va, %select
+  ret <vscale x 2 x i64> %sub
+}
+
+define <vscale x 2 x i64> @vsub_if_uge_swapped_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb) {
+; CHECK-LABEL: vsub_if_uge_swapped_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsub.vv v10, v8, v10
+; CHECK-NEXT:    vminu.vv v8, v8, v10
+; CHECK-NEXT:    ret
+  %cmp = icmp uge <vscale x 2 x i64> %va, %vb
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i64> %vb, <vscale x 2 x i64> zeroinitializer
+  %sub = sub nuw <vscale x 2 x i64> %va, %select
+  ret <vscale x 2 x i64> %sub
+}
+
+define <vscale x 2 x i8> @sub_if_uge_C_nxv2i8(<vscale x 2 x i8> %x) {
+; CHECK-LABEL: sub_if_uge_C_nxv2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vadd.vi v9, v8, -13
+; CHECK-NEXT:    vminu.vv v8, v9, v8
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <vscale x 2 x i8> %x, splat (i8 12)
+  %sub = add <vscale x 2 x i8> %x, splat (i8 -13)
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i8> %sub, <vscale x 2 x i8> %x
+  ret <vscale x 2 x i8> %select
+}
+
+define <vscale x 2 x i16> @sub_if_uge_C_nxv2i16(<vscale x 2 x i16> %x) {
+; CHECK-LABEL: sub_if_uge_C_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, -2001
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vadd.vx v9, v8, a0
+; CHECK-NEXT:    vminu.vv v8, v9, v8
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <vscale x 2 x i16> %x, splat (i16 2000)
+  %sub = add <vscale x 2 x i16> %x, splat (i16 -2001)
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i16> %sub, <vscale x 2 x i16> %x
+  ret <vscale x 2 x i16> %select
+}
+
+define <vscale x 2 x i32> @sub_if_uge_C_nxv2i32(<vscale x 2 x i32> %x) {
+; CHECK-LABEL: sub_if_uge_C_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, 1048560
+; CHECK-NEXT:    addi a0, a0, 15
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vx v9, v8, a0
+; CHECK-NEXT:    vminu.vv v8, v9, v8
+; CHECK-NEXT:    ret
+  %cmp = icmp ugt <vscale x 2 x i32> %x, splat (i32 65520)
+  %sub = add <vscale x 2 x i32> %x, splat (i32 -65521)
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i32> %sub, <vscale x 2 x i32> %x
+  ret <vscale x 2 x i32> %select
+}
+
+define <vscale x 2 x i32> @sub_if_uge_C_swapped_nxv2i32(<vscale x 2 x i32> %x) {
+; CHECK-LABEL: sub_if_uge_C_swapped_nxv2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, 1048560
+; CHECK-NEXT:    addi a0, a0, 15
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vx v9, v8, a0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %cmp = icmp ult <vscale x 2 x i32> %x, splat (i32 65521)
+  %sub = add <vscale x 2 x i32> %x, splat (i32 -65521)
+  %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i32> %x, <vscale x 2 x i32> %sub
+  ret <vscale x 2 x i32> %select
+}
+
+define <vscale x 2 x i64> @sub_if_uge_C_nxv2i64(<vscale x 2 x i64> %x) nounwind {
+; RV32-LABEL: sub_if_uge_C_nxv2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    li a0, -2
+; RV32-NEXT:    lui a1, 876449
+; RV32-NEXT:    addi a1, a1, -513
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vadd.vv v10, v8, v10 +; RV32-NEXT: vminu.vv v8, v10, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sub_if_uge_C_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a0, 1048278 +; RV64-NEXT: addi a0, a0, -95 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -513 +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vadd.vx v10, v8, a0 +; RV64-NEXT: vminu.vv v8, v10, v8 +; RV64-NEXT: ret + %cmp = icmp ugt %x, splat (i64 5000000000) + %sub = add %x, splat (i64 -5000000001) + %select = select %cmp, %sub, %x + ret %select +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 7fb822d20f892..23c0c826e85e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -18,7 +18,7 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg2e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 @@ -31,30 +31,18 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % define {, , } @load_factor3_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor3_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 699051 -; RV32-NEXT: addi a2, a2, -1365 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg3e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor3_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 699051 -; RV64-NEXT: addi a2, a2, -1365 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg3e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 3 + %rvl = mul nuw i32 %evl, 3 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , , } @llvm.vector.deinterleave3( %wide.masked.load) %t0 = extractvalue { , , } %deinterleaved.results, 0 @@ -66,6 +54,91 @@ define {, , } @load_factor ret { , , } %res1 } +define {, } @load_factor3_partial(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor3_partial: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg3e32.v v7, (a0) +; RV32-NEXT: vmv1r.v v8, v7 +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor3_partial: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg3e32.v v7, (a0) +; RV64-NEXT: vmv1r.v v8, v7 +; RV64-NEXT: ret + %rvl = mul nuw i32 %evl, 3 + %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , , } @llvm.vector.deinterleave3( %wide.masked.load) + %t0 = extractvalue { , , } %deinterleaved.results, 0 + %t2 = extractvalue { , , } %deinterleaved.results, 2 + %res0 = insertvalue { , } poison, %t0, 0 + %res1 = insertvalue { , } %res0, %t2, 1 + ret { , } %res1 +} + +; 
InterleavedAccess should kick in even if the users of deinterleave intrinsic are not extractvalue. +define {, } @load_factor3_no_extract(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor3_no_extract: +; RV32: # %bb.0: +; RV32-NEXT: li a2, 12 +; RV32-NEXT: beq a1, a2, .LBB3_2 +; RV32-NEXT: # %bb.1: # %bb0 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg3e32.v v7, (a0) +; RV32-NEXT: j .LBB3_3 +; RV32-NEXT: .LBB3_2: # %bb1 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vlseg3e32.v v7, (a0) +; RV32-NEXT: .LBB3_3: # %merge +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv1r.v v8, v7 +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor3_no_extract: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: li a3, 12 +; RV64-NEXT: beq a2, a3, .LBB3_2 +; RV64-NEXT: # %bb.1: # %bb0 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg3e32.v v7, (a0) +; RV64-NEXT: j .LBB3_3 +; RV64-NEXT: .LBB3_2: # %bb1 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vlseg3e32.v v7, (a0) +; RV64-NEXT: .LBB3_3: # %merge +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv1r.v v8, v7 +; RV64-NEXT: ret + %p = icmp ne i32 %evl, 12 + br i1 %p, label %bb0, label %bb1 + +bb0: + %rvl.0 = mul nuw i32 %evl, 3 + %wide.load.0 = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl.0) + %deinterleaved.results.0 = call { , , } @llvm.vector.deinterleave3( %wide.load.0) + br label %merge + +bb1: + %wide.load.1 = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 12) + %deinterleaved.results.1 = call { , , } @llvm.vector.deinterleave3( %wide.load.1) + br label %merge + +merge: + %deinterleaved.results = phi { , , } [%deinterleaved.results.0, %bb0], [%deinterleaved.results.1, %bb1] + %t0 = extractvalue { , , } %deinterleaved.results, 0 + %t2 = extractvalue { , , } %deinterleaved.results, 2 + %res0 = insertvalue { , } poison, %t0, 0 + %res1 = insertvalue { , } %res0, %t2, 1 + ret { , } %res1 +} + define {, , , } @load_factor4_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor4_v2: ; RV32: # %bb.0: @@ -82,7 +155,7 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) %t0 = extractvalue { , , , } %d, 0 @@ -100,30 +173,18 @@ define {, , , , , , , } @load_factor5_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor5_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 838861 -; RV32-NEXT: addi a2, a2, -819 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg5e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor5_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 2 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 838861 -; RV64-NEXT: addi a2, a2, -819 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg5e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 5 + %rvl = mul nuw i32 %evl, 5 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , , , , } @llvm.vector.deinterleave5( %wide.masked.load) %t0 = extractvalue { , , , , } %deinterleaved.results, 0 @@ -142,37 +203,18 @@ define {, , , , , , , , , } @load_factor7_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: 
load_factor7_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: lui a1, 149797 -; RV32-NEXT: addi a1, a1, -1755 -; RV32-NEXT: mulhu a1, a2, a1 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: srli a2, a2, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg7e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor7_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: lui a3, 149797 -; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: addi a1, a3, -1755 -; RV64-NEXT: slli a3, a2, 32 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: mulhu a1, a3, a1 ; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: srliw a2, a2, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg7e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 7 + %rvl = mul nuw i32 %evl, 7 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , , , , , , } @llvm.vector.deinterleave7( %wide.masked.load) %t0 = extractvalue { , , , , , , } %deinterleaved.results, 0 @@ -208,7 +250,7 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( %wide.masked.load) %t0 = extractvalue { , , , , , , , } %d, 0 @@ -247,7 +289,7 @@ define void @store_factor2_v2( %v0, %v1, pt ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg2e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -256,30 +298,18 @@ define void @store_factor2_v2( %v0, %v1, pt define void @store_factor3_v2( %v0, %v1, %v2, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor3_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 699051 -; RV32-NEXT: addi a2, a2, -1365 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vsseg3e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor3_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 699051 -; RV64-NEXT: addi a2, a2, -1365 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg3e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 3 + %rvl = mul nuw i32 %evl, 3 %interleaved.vec = call @llvm.vector.interleave3( %v0, %v1, %v2) call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -305,7 +335,7 @@ define void @store_factor4_v2( %v0, %v1, pt ; RV64-NEXT: vmv1r.v v11, v9 ; RV64-NEXT: vsseg4e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 8 + %rvl = mul nuw i32 %evl, 8 %interleaved.vec = call @llvm.vector.interleave4.nxv4i32( %v0, %v1, %v0, %v1) call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -314,30 +344,18 @@ define void @store_factor4_v2( %v0, %v1, pt define void @store_factor5_v2( %v0, %v1, %v2, %v3, %v4, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor5_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 838861 -; 
RV32-NEXT: addi a2, a2, -819 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vsseg5e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor5_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 2 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 838861 -; RV64-NEXT: addi a2, a2, -819 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg5e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 5 + %rvl = mul nuw i32 %evl, 5 %interleaved.vec = call @llvm.vector.interleave5( %v0, %v1, %v2, %v3, %v4) call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -346,37 +364,18 @@ define void @store_factor5_v2( %v0, %v1, %v0, %v1, %v2, %v3, %v4, %v5, %v6, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor7_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: lui a1, 149797 -; RV32-NEXT: addi a1, a1, -1755 -; RV32-NEXT: mulhu a1, a2, a1 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: srli a2, a2, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vsseg7e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor7_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: lui a3, 149797 -; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: addi a1, a3, -1755 -; RV64-NEXT: slli a3, a2, 32 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: mulhu a1, a3, a1 ; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: srliw a2, a2, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg7e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 7 + %rvl = mul nuw i32 %evl, 7 %interleaved.vec = call @llvm.vector.interleave7( %v0, %v1, %v2, %v3, %v4, %v5, %v6) call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -410,7 +409,7 @@ define void @store_factor8_v2( %v0, %v1, pt ; RV64-NEXT: vmv1r.v v15, v9 ; RV64-NEXT: vsseg8e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 8 + %rvl = mul nuw i32 %evl, 8 %interleaved.vec = call @llvm.vector.interleave8.nxv8i32( %v0, %v1, %v0, %v1, %v0, %v1, %v0, %v1) call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -432,7 +431,7 @@ define {, } @masked_load_factor2_v2( @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -459,7 +458,7 @@ define {, , , @llvm.vector.interleave4.nxv8i1( %mask, %mask, %mask, %mask) %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) @@ -495,7 +494,7 @@ define void @masked_store_factor2_v2( %mask, ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv2i1( %mask, %mask) %interleaved.vec = tail call @llvm.vector.interleave2.nxv2i32( %v0, %v0) tail call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) @@ -520,7 +519,7 @@ define void 
@masked_load_store_factor2_v2_shared_mask( %mask, p ; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t ; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -610,7 +609,7 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -642,13 +641,58 @@ define void @masked_store_factor4_v2( %mask, ; RV64-NEXT: vmv1r.v v11, v9 ; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t ; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 + %rvl = mul nuw i32 %evl, 4 %interleaved.mask = call @llvm.vector.interleave4.nxv4i1( %mask, %mask, %mask, %mask) %interleaved.vec = call @llvm.vector.interleave4.nxv2i32( %v0, %v1, %v0, %v1) call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) ret void } +define @load_factor2_oneactive(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor2_oneactive: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v7, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor2_oneactive: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v7, (a0) +; RV64-NEXT: ret + %rvl = mul nuw i32 %evl, 4 + %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 1 + ret %t0 +} + +define @load_factor5_oneactive(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor5_oneactive: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg5e32.v v5, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor5_oneactive: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg5e32.v v5, (a0) +; RV64-NEXT: ret + %rvl = mul nuw i32 %evl, 5 + %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , , , , } @llvm.vector.deinterleave5( %wide.masked.load) + %t3 = extractvalue { , , , , } %deinterleaved.results, 3 + ret %t3 +} + + ; Negative tests define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { @@ -724,7 +768,7 @@ define {, } @not_same_mask( ; RV64-NEXT: vnsrl.wx v9, v10, a0 ; RV64-NEXT: vnsrl.wi v8, v10, 0 ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask0, %mask1) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll 
b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll index 206838917d004..ad2ed47e67e64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll @@ -153,20 +153,19 @@ define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; NO_FOLDING-NEXT: vlm.v v8, (a0) -; NO_FOLDING-NEXT: vlm.v v9, (a1) -; NO_FOLDING-NEXT: vlm.v v10, (a2) -; NO_FOLDING-NEXT: vmv.v.i v11, 0 +; NO_FOLDING-NEXT: vmv.v.i v10, 0 ; NO_FOLDING-NEXT: vmv.v.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; NO_FOLDING-NEXT: vmerge.vim v11, v10, -1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a1) +; NO_FOLDING-NEXT: vlm.v v9, (a2) +; NO_FOLDING-NEXT: vmerge.vim v12, v10, -1, v0 ; NO_FOLDING-NEXT: vmv.v.v v0, v9 -; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 -; NO_FOLDING-NEXT: vmv.v.v v0, v10 -; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 -; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 -; NO_FOLDING-NEXT: vsub.vv v11, v12, v10 +; NO_FOLDING-NEXT: vmerge.vim v9, v10, -1, v0 +; NO_FOLDING-NEXT: vmul.vv v10, v11, v12 +; NO_FOLDING-NEXT: vsub.vv v11, v11, v9 ; NO_FOLDING-NEXT: vmv.v.v v0, v8 -; NO_FOLDING-NEXT: vadd.vi v10, v10, -1, v0.t -; NO_FOLDING-NEXT: vor.vv v8, v9, v10 +; NO_FOLDING-NEXT: vadd.vi v9, v9, -1, v0.t +; NO_FOLDING-NEXT: vor.vv v8, v10, v9 ; NO_FOLDING-NEXT: vor.vv v8, v8, v11 ; NO_FOLDING-NEXT: ret ; @@ -174,20 +173,19 @@ define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; FOLDING-NEXT: vlm.v v8, (a0) -; FOLDING-NEXT: vlm.v v9, (a1) -; FOLDING-NEXT: vlm.v v10, (a2) -; FOLDING-NEXT: vmv.v.i v11, 0 +; FOLDING-NEXT: vmv.v.i v10, 0 ; FOLDING-NEXT: vmv.v.v v0, v8 -; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; FOLDING-NEXT: vmerge.vim v11, v10, -1, v0 +; FOLDING-NEXT: vlm.v v0, (a1) +; FOLDING-NEXT: vlm.v v9, (a2) +; FOLDING-NEXT: vmerge.vim v12, v10, -1, v0 ; FOLDING-NEXT: vmv.v.v v0, v9 -; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 -; FOLDING-NEXT: vmv.v.v v0, v10 -; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 -; FOLDING-NEXT: vmul.vv v9, v12, v9 -; FOLDING-NEXT: vsub.vv v11, v12, v10 +; FOLDING-NEXT: vmerge.vim v9, v10, -1, v0 +; FOLDING-NEXT: vmul.vv v10, v11, v12 +; FOLDING-NEXT: vsub.vv v11, v11, v9 ; FOLDING-NEXT: vmv.v.v v0, v8 -; FOLDING-NEXT: vadd.vi v10, v10, -1, v0.t -; FOLDING-NEXT: vor.vv v8, v9, v10 +; FOLDING-NEXT: vadd.vi v9, v9, -1, v0.t +; FOLDING-NEXT: vor.vv v8, v10, v9 ; FOLDING-NEXT: vor.vv v8, v8, v11 ; FOLDING-NEXT: ret %a = load , ptr %x @@ -209,20 +207,19 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; NO_FOLDING-NEXT: vlm.v v8, (a0) -; NO_FOLDING-NEXT: vlm.v v9, (a1) -; NO_FOLDING-NEXT: vlm.v v10, (a2) -; NO_FOLDING-NEXT: vmv.v.i v11, 0 +; NO_FOLDING-NEXT: vmv.v.i v10, 0 ; NO_FOLDING-NEXT: vmv1r.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; NO_FOLDING-NEXT: vmerge.vim v11, v10, -1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a1) +; NO_FOLDING-NEXT: vlm.v v9, (a2) +; NO_FOLDING-NEXT: vmerge.vim v12, v10, -1, v0 ; NO_FOLDING-NEXT: vmv1r.v v0, v9 -; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 -; NO_FOLDING-NEXT: vmv1r.v v0, v10 -; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 -; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 -; NO_FOLDING-NEXT: vsub.vv v11, v12, v10 +; NO_FOLDING-NEXT: vmerge.vim v9, v10, -1, v0 +; NO_FOLDING-NEXT: vmul.vv v10, v11, v12 +; 
NO_FOLDING-NEXT: vsub.vv v11, v11, v9 ; NO_FOLDING-NEXT: vmv1r.v v0, v8 -; NO_FOLDING-NEXT: vadd.vi v10, v10, -1, v0.t -; NO_FOLDING-NEXT: vor.vv v8, v9, v10 +; NO_FOLDING-NEXT: vadd.vi v9, v9, -1, v0.t +; NO_FOLDING-NEXT: vor.vv v8, v10, v9 ; NO_FOLDING-NEXT: vor.vv v8, v8, v11 ; NO_FOLDING-NEXT: ret ; @@ -230,20 +227,19 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; FOLDING-NEXT: vlm.v v8, (a0) -; FOLDING-NEXT: vlm.v v9, (a1) -; FOLDING-NEXT: vlm.v v10, (a2) -; FOLDING-NEXT: vmv.v.i v11, 0 +; FOLDING-NEXT: vmv.v.i v10, 0 ; FOLDING-NEXT: vmv1r.v v0, v8 -; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; FOLDING-NEXT: vmerge.vim v11, v10, -1, v0 +; FOLDING-NEXT: vlm.v v0, (a1) +; FOLDING-NEXT: vlm.v v9, (a2) +; FOLDING-NEXT: vmerge.vim v12, v10, -1, v0 ; FOLDING-NEXT: vmv1r.v v0, v9 -; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 -; FOLDING-NEXT: vmv1r.v v0, v10 -; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 -; FOLDING-NEXT: vmul.vv v9, v12, v9 -; FOLDING-NEXT: vsub.vv v11, v12, v10 +; FOLDING-NEXT: vmerge.vim v9, v10, -1, v0 +; FOLDING-NEXT: vmul.vv v10, v11, v12 +; FOLDING-NEXT: vsub.vv v11, v11, v9 ; FOLDING-NEXT: vmv1r.v v0, v8 -; FOLDING-NEXT: vadd.vi v10, v10, -1, v0.t -; FOLDING-NEXT: vor.vv v8, v9, v10 +; FOLDING-NEXT: vadd.vi v9, v9, -1, v0.t +; FOLDING-NEXT: vor.vv v8, v10, v9 ; FOLDING-NEXT: vor.vv v8, v8, v11 ; FOLDING-NEXT: ret %a = load , ptr %x @@ -444,16 +440,14 @@ define @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; NO_FOLDING-NEXT: vlm.v v0, (a0) -; NO_FOLDING-NEXT: vlm.v v8, (a2) -; NO_FOLDING-NEXT: vlm.v v9, (a1) -; NO_FOLDING-NEXT: vmv.v.i v10, 0 -; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; NO_FOLDING-NEXT: vmv.v.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v8 -; NO_FOLDING-NEXT: vsub.vv v8, v11, v8 -; NO_FOLDING-NEXT: vmv.v.v v0, v9 -; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; NO_FOLDING-NEXT: vmv.v.i v8, 0 +; NO_FOLDING-NEXT: vmerge.vim v9, v8, 1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a2) +; NO_FOLDING-NEXT: vmerge.vim v8, v8, 1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a1) +; NO_FOLDING-NEXT: vadd.vv v10, v9, v8 +; NO_FOLDING-NEXT: vsub.vv v8, v9, v8 +; NO_FOLDING-NEXT: vor.vv v10, v10, v9, v0.t ; NO_FOLDING-NEXT: vor.vv v8, v10, v8 ; NO_FOLDING-NEXT: ret ; @@ -461,16 +455,14 @@ define @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; FOLDING-NEXT: vlm.v v0, (a0) -; FOLDING-NEXT: vlm.v v8, (a2) -; FOLDING-NEXT: vlm.v v9, (a1) -; FOLDING-NEXT: vmv.v.i v10, 0 -; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; FOLDING-NEXT: vmv.v.v v0, v8 -; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; FOLDING-NEXT: vadd.vv v10, v11, v8 -; FOLDING-NEXT: vsub.vv v8, v11, v8 -; FOLDING-NEXT: vmv.v.v v0, v9 -; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; FOLDING-NEXT: vmv.v.i v8, 0 +; FOLDING-NEXT: vmerge.vim v9, v8, 1, v0 +; FOLDING-NEXT: vlm.v v0, (a2) +; FOLDING-NEXT: vmerge.vim v8, v8, 1, v0 +; FOLDING-NEXT: vlm.v v0, (a1) +; FOLDING-NEXT: vadd.vv v10, v9, v8 +; FOLDING-NEXT: vsub.vv v8, v9, v8 +; FOLDING-NEXT: vor.vv v10, v10, v9, v0.t ; FOLDING-NEXT: vor.vv v8, v10, v8 ; FOLDING-NEXT: ret %a = load , ptr %x @@ -492,16 +484,14 @@ define @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, p ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; NO_FOLDING-NEXT: vlm.v v0, (a0) -; 
NO_FOLDING-NEXT: vlm.v v8, (a2) -; NO_FOLDING-NEXT: vlm.v v9, (a1) -; NO_FOLDING-NEXT: vmv.v.i v10, 0 -; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; NO_FOLDING-NEXT: vmv1r.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v8 -; NO_FOLDING-NEXT: vsub.vv v8, v11, v8 -; NO_FOLDING-NEXT: vmv1r.v v0, v9 -; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; NO_FOLDING-NEXT: vmv.v.i v8, 0 +; NO_FOLDING-NEXT: vmerge.vim v9, v8, 1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a2) +; NO_FOLDING-NEXT: vmerge.vim v8, v8, 1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a1) +; NO_FOLDING-NEXT: vadd.vv v10, v9, v8 +; NO_FOLDING-NEXT: vsub.vv v8, v9, v8 +; NO_FOLDING-NEXT: vor.vv v10, v10, v9, v0.t ; NO_FOLDING-NEXT: vor.vv v8, v10, v8 ; NO_FOLDING-NEXT: ret ; @@ -509,16 +499,14 @@ define @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, p ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; FOLDING-NEXT: vlm.v v0, (a0) -; FOLDING-NEXT: vlm.v v8, (a2) -; FOLDING-NEXT: vlm.v v9, (a1) -; FOLDING-NEXT: vmv.v.i v10, 0 -; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; FOLDING-NEXT: vmv1r.v v0, v8 -; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; FOLDING-NEXT: vadd.vv v10, v11, v8 -; FOLDING-NEXT: vsub.vv v8, v11, v8 -; FOLDING-NEXT: vmv1r.v v0, v9 -; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; FOLDING-NEXT: vmv.v.i v8, 0 +; FOLDING-NEXT: vmerge.vim v9, v8, 1, v0 +; FOLDING-NEXT: vlm.v v0, (a2) +; FOLDING-NEXT: vmerge.vim v8, v8, 1, v0 +; FOLDING-NEXT: vlm.v v0, (a1) +; FOLDING-NEXT: vadd.vv v10, v9, v8 +; FOLDING-NEXT: vsub.vv v8, v9, v8 +; FOLDING-NEXT: vor.vv v10, v10, v9, v0.t ; FOLDING-NEXT: vor.vv v8, v10, v8 ; FOLDING-NEXT: ret %a = load , ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll index 9cdec6a9ff2e9..30044ad580143 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll @@ -494,17 +494,17 @@ define @vfmerge_nzv_nxv8f64( %va, @vselect_combine_regression( %va, %vb) { ; CHECK-LABEL: vselect_combine_regression: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu -; CHECK-NEXT: vmseq.vi v24, v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: vle64.v v8, (a0), v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vle64.v v16, (a1), v0.t +; CHECK-NEXT: vmseq.vi v0, v24, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vle64.v v16, (a0), v0.t ; CHECK-NEXT: ret %cond = icmp eq %va, zeroinitializer %sel = select %cond, %vb, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index 261a1f8fd2c6c..7990dfc0880a5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -304,27 +304,27 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64X60-NEXT: li t1, 0 ; RV64X60-NEXT: addi s1, a7, -1 ; RV64X60-NEXT: zext.w s1, s1 -; RV64X60-NEXT: mul t2, a1, s1 -; RV64X60-NEXT: mul t3, a3, s1 -; RV64X60-NEXT: mul t4, a5, s1 +; RV64X60-NEXT: mul t3, a1, s1 +; RV64X60-NEXT: mul t4, a3, s1 +; RV64X60-NEXT: mul t5, a5, s1 ; RV64X60-NEXT: add s0, a0, a6 -; RV64X60-NEXT: add s1, a2, a6 -; 
RV64X60-NEXT: add t5, a4, a6 -; RV64X60-NEXT: add s0, s0, t2 ; RV64X60-NEXT: csrr t2, vlenb -; RV64X60-NEXT: add t3, t3, s1 +; RV64X60-NEXT: add s1, a2, a6 +; RV64X60-NEXT: add t3, t3, s0 +; RV64X60-NEXT: add s0, a4, a6 +; RV64X60-NEXT: add t4, t4, s1 ; RV64X60-NEXT: li t6, 32 -; RV64X60-NEXT: add t4, t4, t5 -; RV64X60-NEXT: sltu t3, a0, t3 -; RV64X60-NEXT: sltu s1, a2, s0 -; RV64X60-NEXT: and t3, t3, s1 -; RV64X60-NEXT: or t5, a1, a3 -; RV64X60-NEXT: sltu s1, a0, t4 -; RV64X60-NEXT: sltu s0, a4, s0 -; RV64X60-NEXT: slti t4, t5, 0 +; RV64X60-NEXT: add t5, t5, s0 +; RV64X60-NEXT: sltu s0, a0, t4 +; RV64X60-NEXT: sltu s1, a2, t3 +; RV64X60-NEXT: and t4, s0, s1 +; RV64X60-NEXT: or s2, a1, a3 +; RV64X60-NEXT: sltu s0, a0, t5 +; RV64X60-NEXT: sltu s1, a4, t3 +; RV64X60-NEXT: slti t3, s2, 0 ; RV64X60-NEXT: and s0, s0, s1 ; RV64X60-NEXT: or s1, a1, a5 -; RV64X60-NEXT: or t4, t3, t4 +; RV64X60-NEXT: or t4, t4, t3 ; RV64X60-NEXT: slli t3, t2, 1 ; RV64X60-NEXT: slti s1, s1, 0 ; RV64X60-NEXT: or s0, s0, s1 diff --git a/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir b/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir new file mode 100644 index 0000000000000..9ab7f41045c4d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir @@ -0,0 +1,198 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=riscv64 -x mir -run-pass=prologepilog -verify-machineinstrs < %s \ +# RUN: | FileCheck %s -check-prefixes=RV64I +# RUN: llc -mtriple=riscv32 -x mir -run-pass=prologepilog -verify-machineinstrs < %s \ +# RUN: | FileCheck %s -check-prefixes=RV32I +--- | + ; Function Attrs: uwtable + define void @no_reserved_call_frame(i64 %n) #0 { + entry: + %v = alloca i32, i64 %n, align 4 + call void @callee_stack_args(ptr %v, [518 x i64] poison) + ret void + } + + declare void @callee_stack_args(ptr, [518 x i64]) #1 + + attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+m" } + attributes #1 = { "target-features"="+m" } +... 
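+# A note on what this test covers (inferred from the function body and
+# attributes above): the variable-sized alloca means the call frame cannot
+# be reserved, so the ADJCALLSTACKDOWN/ADJCALLSTACKUP pair must be lowered
+# to explicit SP adjustments around the call, and this must compose with
+# the "probe-stack"="inline-asm" stack-probing loop emitted in the prologue.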
+--- +name: no_reserved_call_frame +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: true +registers: [] +liveins: + - { reg: '$x10', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 8 + adjustsStack: true + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: v, type: variable-sized, offset: 0, alignment: 1, stack-id: default, + callee-saved-register: '', callee-saved-restored: true, local-offset: 0, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + varArgsFrameIndex: 0 + varArgsSaveSize: 0 +body: | + ; RV64I-LABEL: name: no_reserved_call_frame + ; RV64I: bb.0.entry: + ; RV64I-NEXT: successors: %bb.1(0x80000000) + ; RV64I-NEXT: liveins: $x10, $x1 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: $x2 = frame-setup ADDI $x2, -16 + ; RV64I-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; RV64I-NEXT: frame-setup SD killed $x1, $x2, 8 :: (store (s64) into %stack.1) + ; RV64I-NEXT: frame-setup SD killed $x8, $x2, 0 :: (store (s64) into %stack.2) + ; RV64I-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -8 + ; RV64I-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -16 + ; RV64I-NEXT: $x8 = frame-setup ADDI $x2, 16 + ; RV64I-NEXT: frame-setup CFI_INSTRUCTION def_cfa $x8, 0 + ; RV64I-NEXT: renamable $x10 = SLLI killed renamable $x10, 2 + ; RV64I-NEXT: renamable $x10 = nuw ADDI killed renamable $x10, 15 + ; RV64I-NEXT: renamable $x10 = ANDI killed renamable $x10, -16 + ; RV64I-NEXT: renamable $x10 = SUB $x2, killed renamable $x10 + ; RV64I-NEXT: renamable $x11 = LUI 1 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: bb.1.entry: + ; RV64I-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; RV64I-NEXT: liveins: $x10, $x11 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: $x2 = SUB $x2, renamable $x11 + ; RV64I-NEXT: SD $x0, $x2, 0 + ; RV64I-NEXT: BLT renamable $x10, $x2, %bb.1 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: bb.2.entry: + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: $x2 = ADDI renamable $x10, 0 + ; RV64I-NEXT: $x11 = LUI 1 + ; RV64I-NEXT: $x2 = SUB $x2, killed $x11 + ; RV64I-NEXT: PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2 + ; RV64I-NEXT: $x10 = LUI 1 + ; RV64I-NEXT: $x2 = ADD $x2, killed $x10 + ; RV64I-NEXT: $x2 = frame-destroy ADDI $x8, -16 + ; RV64I-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $x2, 16 + ; RV64I-NEXT: $x1 = frame-destroy LD $x2, 8 :: (load (s64) from %stack.1) + ; RV64I-NEXT: $x8 = frame-destroy LD $x2, 0 :: (load 
(s64) from %stack.2) + ; RV64I-NEXT: frame-destroy CFI_INSTRUCTION restore $x1 + ; RV64I-NEXT: frame-destroy CFI_INSTRUCTION restore $x8 + ; RV64I-NEXT: $x2 = frame-destroy ADDI $x2, 16 + ; RV64I-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 + ; RV64I-NEXT: PseudoRET + ; + ; RV32I-LABEL: name: no_reserved_call_frame + ; RV32I: bb.0.entry: + ; RV32I-NEXT: successors: %bb.1(0x80000000) + ; RV32I-NEXT: liveins: $x10, $x1 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: $x2 = frame-setup ADDI $x2, -16 + ; RV32I-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; RV32I-NEXT: frame-setup SW killed $x1, $x2, 12 :: (store (s32) into %stack.1) + ; RV32I-NEXT: frame-setup SW killed $x8, $x2, 8 :: (store (s32) into %stack.2) + ; RV32I-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -4 + ; RV32I-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -8 + ; RV32I-NEXT: $x8 = frame-setup ADDI $x2, 16 + ; RV32I-NEXT: frame-setup CFI_INSTRUCTION def_cfa $x8, 0 + ; RV32I-NEXT: renamable $x10 = SLLI killed renamable $x10, 2 + ; RV32I-NEXT: renamable $x10 = nuw ADDI killed renamable $x10, 15 + ; RV32I-NEXT: renamable $x10 = ANDI killed renamable $x10, -16 + ; RV32I-NEXT: renamable $x10 = SUB $x2, killed renamable $x10 + ; RV32I-NEXT: renamable $x11 = LUI 1 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: bb.1.entry: + ; RV32I-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; RV32I-NEXT: liveins: $x10, $x11 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: $x2 = SUB $x2, renamable $x11 + ; RV32I-NEXT: SD $x0, $x2, 0 + ; RV32I-NEXT: BLT renamable $x10, $x2, %bb.1 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: bb.2.entry: + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: $x2 = ADDI renamable $x10, 0 + ; RV32I-NEXT: $x11 = LUI 1 + ; RV32I-NEXT: $x2 = SUB $x2, killed $x11 + ; RV32I-NEXT: PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2 + ; RV32I-NEXT: $x10 = LUI 1 + ; RV32I-NEXT: $x2 = ADD $x2, killed $x10 + ; RV32I-NEXT: $x2 = frame-destroy ADDI $x8, -16 + ; RV32I-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $x2, 16 + ; RV32I-NEXT: $x1 = frame-destroy LW $x2, 12 :: (load (s32) from %stack.1) + ; RV32I-NEXT: $x8 = frame-destroy LW $x2, 8 :: (load (s32) from %stack.2) + ; RV32I-NEXT: frame-destroy CFI_INSTRUCTION restore $x1 + ; RV32I-NEXT: frame-destroy CFI_INSTRUCTION restore $x8 + ; RV32I-NEXT: $x2 = frame-destroy ADDI $x2, 16 + ; RV32I-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 + ; RV32I-NEXT: PseudoRET + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $x10 + + renamable $x10 = SLLI killed renamable $x10, 2 + renamable $x10 = nuw ADDI killed renamable $x10, 15 + renamable $x10 = ANDI killed renamable $x10, -16 + renamable $x10 = SUB $x2, killed renamable $x10 + renamable $x11 = LUI 1 + + bb.1.entry: + successors: %bb.2(0x40000000), %bb.1(0x40000000) + liveins: $x10, $x11 + + $x2 = SUB $x2, renamable $x11 + SD $x0, $x2, 0 + BLT renamable $x10, $x2, %bb.1 + + bb.2.entry: + liveins: $x10 + + $x2 = ADDI renamable $x10, 0 + ADJCALLSTACKDOWN 4088, 0, implicit-def dead $x2, implicit $x2 + PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2 + ADJCALLSTACKUP 4088, 0, 
implicit-def dead $x2, implicit $x2 + PseudoRET +... diff --git a/llvm/test/CodeGen/RISCV/stack-protector-target.ll b/llvm/test/CodeGen/RISCV/stack-protector-target.ll index a4bd0e9ceac98..b01b044db5d6f 100644 --- a/llvm/test/CodeGen/RISCV/stack-protector-target.ll +++ b/llvm/test/CodeGen/RISCV/stack-protector-target.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=riscv64-linux < %s | FileCheck --check-prefix=LINUX-RISCV64 %s ; RUN: llc -mtriple=riscv64-fuchsia < %s | FileCheck --check-prefix=FUCHSIA-RISCV64 %s ; RUN: llc -mtriple=riscv64-android < %s | FileCheck --check-prefix=ANDROID-RISCV64 %s +; RUN: llc -mtriple=riscv64-openbsd < %s | FileCheck --check-prefix=OPENBSD-RISCV64 %s define void @func() sspreq nounwind { ; LINUX-RISCV64-LABEL: func: @@ -63,6 +64,29 @@ define void @func() sspreq nounwind { ; ANDROID-RISCV64-NEXT: ret ; ANDROID-RISCV64-NEXT: .LBB0_2: # %CallStackCheckFailBlk ; ANDROID-RISCV64-NEXT: call __stack_chk_fail +; +; OPENBSD-RISCV64-LABEL: func: +; OPENBSD-RISCV64: # %bb.0: +; OPENBSD-RISCV64-NEXT: addi sp, sp, -32 +; OPENBSD-RISCV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; OPENBSD-RISCV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; OPENBSD-RISCV64-NEXT: lui s0, %hi(__guard_local) +; OPENBSD-RISCV64-NEXT: ld a0, %lo(__guard_local)(s0) +; OPENBSD-RISCV64-NEXT: sd a0, 8(sp) +; OPENBSD-RISCV64-NEXT: addi a0, sp, 4 +; OPENBSD-RISCV64-NEXT: call capture +; OPENBSD-RISCV64-NEXT: ld a0, %lo(__guard_local)(s0) +; OPENBSD-RISCV64-NEXT: ld a1, 8(sp) +; OPENBSD-RISCV64-NEXT: bne a0, a1, .LBB0_2 +; OPENBSD-RISCV64-NEXT: # %bb.1: # %SP_return +; OPENBSD-RISCV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; OPENBSD-RISCV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; OPENBSD-RISCV64-NEXT: addi sp, sp, 32 +; OPENBSD-RISCV64-NEXT: ret +; OPENBSD-RISCV64-NEXT: .LBB0_2: # %CallStackCheckFailBlk +; OPENBSD-RISCV64-NEXT: lui a0, %hi(.LSSH) +; OPENBSD-RISCV64-NEXT: addi a0, a0, %lo(.LSSH) +; OPENBSD-RISCV64-NEXT: call __stack_smash_handler %1 = alloca i32, align 4 call void @capture(ptr %1) ret void diff --git a/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll b/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll new file mode 100644 index 0000000000000..854d0b659ea73 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+xandesbfhcvt -target-abi ilp32f \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+xandesbfhcvt -target-abi lp64f \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +define float @fcvt_s_bf16(bfloat %a) nounwind { +; CHECK-LABEL: fcvt_s_bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: nds.fcvt.s.bf16 fa0, fa0 +; CHECK-NEXT: ret + %1 = fpext bfloat %a to float + ret float %1 +} + +define bfloat @fcvt_bf16_s(float %a) nounwind { +; CHECK-LABEL: fcvt_bf16_s: +; CHECK: # %bb.0: +; CHECK-NEXT: nds.fcvt.bf16.s fa0, fa0 +; CHECK-NEXT: ret + %1 = fptrunc float %a to bfloat + ret bfloat %1 +} diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll index f227fa9aa423d..2fa06517508ce 100644 --- a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll +++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll @@ -105,6 +105,7 @@ define i16 @test_cttz_i16(i16 %a) nounwind { ; ; RV32ZBBXQCIBM-LABEL: test_cttz_i16: ; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: not a0, a0 ; RV32ZBBXQCIBM-NEXT: qc.insbi a0, -1, 1, 16 ; RV32ZBBXQCIBM-NEXT: ctz a0, a0 ; RV32ZBBXQCIBM-NEXT: ret diff --git 
a/llvm/test/CodeGen/RISCV/xqcibm-insert.ll b/llvm/test/CodeGen/RISCV/xqcibm-insert.ll index 6b7f9ae856625..88054a691bad1 100644 --- a/llvm/test/CodeGen/RISCV/xqcibm-insert.ll +++ b/llvm/test/CodeGen/RISCV/xqcibm-insert.ll @@ -47,6 +47,29 @@ define i32 @test_insbi_mask(i32 %a) nounwind { ret i32 %or } +define i32 @test_insbi_mask_mv(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: test_insbi_mask_mv: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, 16 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IXQCIBM-LABEL: test_insbi_mask_mv: +; RV32IXQCIBM: # %bb.0: +; RV32IXQCIBM-NEXT: mv a0, a1 +; RV32IXQCIBM-NEXT: qc.insbi a0, -1, 16, 0 +; RV32IXQCIBM-NEXT: ret +; +; RV32IXQCIBMZBS-LABEL: test_insbi_mask_mv: +; RV32IXQCIBMZBS: # %bb.0: +; RV32IXQCIBMZBS-NEXT: mv a0, a1 +; RV32IXQCIBMZBS-NEXT: qc.insbi a0, -1, 16, 0 +; RV32IXQCIBMZBS-NEXT: ret + %or = or i32 %b, 65535 + ret i32 %or +} + define i32 @test_insbi_shifted_mask(i32 %a) nounwind { ; RV32I-LABEL: test_insbi_shifted_mask: ; RV32I: # %bb.0: @@ -67,6 +90,36 @@ define i32 @test_insbi_shifted_mask(i32 %a) nounwind { ret i32 %or } +define i32 @test_insbi_shifted_mask_multiple_uses(i32 %a) nounwind { +; RV32I-LABEL: test_insbi_shifted_mask_multiple_uses: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 15 +; RV32I-NEXT: or a1, a0, a1 +; RV32I-NEXT: addi a0, a0, 10 +; RV32I-NEXT: xor a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IXQCIBM-LABEL: test_insbi_shifted_mask_multiple_uses: +; RV32IXQCIBM: # %bb.0: +; RV32IXQCIBM-NEXT: lui a1, 15 +; RV32IXQCIBM-NEXT: or a1, a1, a0 +; RV32IXQCIBM-NEXT: addi a0, a0, 10 +; RV32IXQCIBM-NEXT: xor a0, a0, a1 +; RV32IXQCIBM-NEXT: ret +; +; RV32IXQCIBMZBS-LABEL: test_insbi_shifted_mask_multiple_uses: +; RV32IXQCIBMZBS: # %bb.0: +; RV32IXQCIBMZBS-NEXT: lui a1, 15 +; RV32IXQCIBMZBS-NEXT: or a1, a1, a0 +; RV32IXQCIBMZBS-NEXT: addi a0, a0, 10 +; RV32IXQCIBMZBS-NEXT: xor a0, a0, a1 +; RV32IXQCIBMZBS-NEXT: ret + %or = or i32 %a, 61440 + %add = add i32 %a, 10 + %xor = xor i32 %or, %add + ret i32 %xor +} + define i32 @test_single_bit_set(i32 %a) nounwind { ; RV32I-LABEL: test_single_bit_set: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/xqcicli.ll b/llvm/test/CodeGen/RISCV/xqcicli.ll new file mode 100644 index 0000000000000..b0d51429556ef --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcicli.ll @@ -0,0 +1,717 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test that we are able to generate the Xqcicli instructions +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicli,+experimental-xqcics -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32IXQCICLI +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicli,+experimental-xqcicm -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32IXQCICLI + +define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bne a1, a2, .LBB0_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB0_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_eq: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lieq a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp eq i32 %b, %x + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_ne(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: 
select_cc_example_ne: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a1, a2, .LBB1_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB1_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ne: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.line a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ne i32 %b, %x + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_slt(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_slt: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bge a1, a2, .LBB2_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB2_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_slt: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lilt a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp slt i32 %b, %x + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_sge(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_sge: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: blt a1, a2, .LBB3_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB3_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_sge: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lige a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp sge i32 %b, %x + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_uge(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_uge: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bltu a1, a2, .LBB4_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB4_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_uge: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligeu a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp uge i32 %b, %x + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_ult(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ult: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bgeu a1, a2, .LBB5_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB5_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ult: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.liltu a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ult i32 %b, %x + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_eq_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a1, a2, .LBB6_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB6_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_eq_c: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.line a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp eq i32 %b, %x + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ne_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ne_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bne a1, a2, .LBB7_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB7_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ne_c: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lieq a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret 
+entry: + %cmp = icmp ne i32 %b, %x + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_slt_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_slt_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: blt a1, a2, .LBB8_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB8_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_slt_c: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lige a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp slt i32 %b, %x + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_sge_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_sge_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bge a1, a2, .LBB9_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB9_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_sge_c: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lilt a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp sge i32 %b, %x + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_uge_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_uge_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bgeu a1, a2, .LBB10_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB10_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_uge_c: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.liltu a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp uge i32 %b, %x + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ult_c(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ult_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bltu a1, a2, .LBB11_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB11_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ult_c: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligeu a0, a1, a2, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ult i32 %b, %x + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_eqi(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eqi: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bne a1, a2, .LBB12_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB12_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_eqi: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lieqi a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp eq i32 %b, 12 + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_nei(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_nei: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: beq a1, a2, .LBB13_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB13_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_nei: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.linei a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ne i32 %b, 12 + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_slti(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_slti: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; 
RV32I-NEXT: bge a1, a2, .LBB14_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB14_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_slti: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lilti a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp slt i32 %b, 12 + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_sgei(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_sgei: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: bge a2, a1, .LBB15_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB15_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_sgei: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligei a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp sge i32 %b, 12 + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_ulti(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ulti: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bgeu a1, a2, .LBB16_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB16_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ulti: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.liltui a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ult i32 %b, 12 + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_ugei(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ugei: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: bgeu a2, a1, .LBB17_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB17_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ugei: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligeui a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp uge i32 %b, 12 + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_eqi_c1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eqi_c1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bne a1, a2, .LBB18_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB18_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_eqi_c1: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lieqi a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp eq i32 12, %b + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_nei_c1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_nei_c1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: beq a1, a2, .LBB19_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB19_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_nei_c1: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.linei a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ne i32 12, %b + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_slti_c1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_slti_c1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bge a2, a1, .LBB20_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB20_2: # %entry +; RV32I-NEXT: ret +; +; 
RV32IXQCICLI-LABEL: select_cc_example_slti_c1: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligei a0, a1, 13, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp slt i32 12, %b + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_sgei_c1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_sgei_c1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 13 +; RV32I-NEXT: bge a1, a2, .LBB21_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB21_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_sgei_c1: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lilti a0, a1, 13, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp sge i32 12, %b + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_ulti_c1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ulti_c1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bgeu a2, a1, .LBB22_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB22_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ulti_c1: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligeui a0, a1, 13, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ult i32 12, %b + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_ugei_c1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ugei_c1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 13 +; RV32I-NEXT: bgeu a1, a2, .LBB23_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB23_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ugei_c1: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.liltui a0, a1, 13, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp uge i32 12, %b + %sel = select i1 %cmp, i32 11, i32 %a + ret i32 %sel +} + +define i32 @select_cc_example_eqi_c2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eqi_c2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: beq a1, a2, .LBB24_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB24_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_eqi_c2: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.linei a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp eq i32 12, %b + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_nei_c2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_nei_c2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bne a1, a2, .LBB25_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB25_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_nei_c2: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lieqi a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ne i32 12, %b + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_slti_c2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_slti_c2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: blt a2, a1, .LBB26_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB26_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_slti_c2: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lilti a0, a1, 
13, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp slt i32 12, %b + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_sgei_c2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_sgei_c2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 13 +; RV32I-NEXT: blt a1, a2, .LBB27_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB27_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_sgei_c2: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligei a0, a1, 13, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp sge i32 12, %b + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ulti_c2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ulti_c2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bltu a2, a1, .LBB28_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB28_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ulti_c2: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.liltui a0, a1, 13, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ult i32 12, %b + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ugei_c2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ugei_c2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 13 +; RV32I-NEXT: bltu a1, a2, .LBB29_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB29_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ugei_c2: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligeui a0, a1, 13, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp uge i32 12, %b + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_eqi_c3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eqi_c3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: beq a1, a2, .LBB30_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB30_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_eqi_c3: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.linei a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp eq i32 %b, 12 + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_nei_c3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_nei_c3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bne a1, a2, .LBB31_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB31_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_nei_c3: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lieqi a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ne i32 %b, 12 + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_slti_c3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_slti_c3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: blt a1, a2, .LBB32_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB32_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_slti_c3: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligei a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp slt i32 %b, 12 + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 
%sel +} + +define i32 @select_cc_example_sgei_c3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_sgei_c3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: blt a2, a1, .LBB33_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB33_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_sgei_c3: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.lilti a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp sge i32 %b, 12 + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ulti_c3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ulti_c3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 12 +; RV32I-NEXT: bltu a1, a2, .LBB34_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB34_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ulti_c3: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.ligeui a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp ult i32 %b, 12 + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ugei_c3(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ugei_c3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: bltu a2, a1, .LBB35_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: .LBB35_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICLI-LABEL: select_cc_example_ugei_c3: +; RV32IXQCICLI: # %bb.0: # %entry +; RV32IXQCICLI-NEXT: qc.liltui a0, a1, 12, 11 +; RV32IXQCICLI-NEXT: ret +entry: + %cmp = icmp uge i32 %b, 12 + %sel = select i1 %cmp, i32 %a, i32 11 + ret i32 %sel +} + diff --git a/llvm/test/CodeGen/RISCV/xqcisls.ll b/llvm/test/CodeGen/RISCV/xqcisls.ll index 2bc4834ad3559..709dc4ce074dc 100644 --- a/llvm/test/CodeGen/RISCV/xqcisls.ll +++ b/llvm/test/CodeGen/RISCV/xqcisls.ll @@ -206,7 +206,7 @@ define void @sw_ri(i32* %a, i32 %b, i32 %c) { ret void } -define i8 @lrb_anyext(ptr %a, i64 %b) { +define i8 @lrb_anyext(ptr %a, i32 %b) { ; RV32I-LABEL: lrb_anyext: ; RV32I: # %bb.0: ; RV32I-NEXT: add a0, a0, a1 @@ -223,175 +223,106 @@ define i8 @lrb_anyext(ptr %a, i64 %b) { ; RV32IZBAXQCISLS: # %bb.0: ; RV32IZBAXQCISLS-NEXT: qc.lrbu a0, a0, a1, 0 ; RV32IZBAXQCISLS-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b + %1 = getelementptr i8, ptr %a, i32 %b %2 = load i8, ptr %1, align 1 ret i8 %2 } -define i64 @lrb(ptr %a, i64 %b) { +define i32 @lrb(ptr %a, i32 %b) { ; RV32I-LABEL: lrb: ; RV32I: # %bb.0: ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lb a1, 0(a0) -; RV32I-NEXT: srai a2, a1, 31 -; RV32I-NEXT: add a0, a1, a1 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: add a2, a2, a2 -; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: lb a0, 0(a0) +; RV32I-NEXT: add a0, a0, a0 ; RV32I-NEXT: ret ; ; RV32IZBA-LABEL: lrb: ; RV32IZBA: # %bb.0: ; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: lb a1, 0(a0) -; RV32IZBA-NEXT: srai a2, a1, 31 -; RV32IZBA-NEXT: add a0, a1, a1 -; RV32IZBA-NEXT: sltu a1, a0, a1 -; RV32IZBA-NEXT: add a2, a2, a2 -; RV32IZBA-NEXT: add a1, a2, a1 +; RV32IZBA-NEXT: lb a0, 0(a0) +; RV32IZBA-NEXT: add a0, a0, a0 ; RV32IZBA-NEXT: ret ; ; RV32IZBAXQCISLS-LABEL: lrb: ; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: qc.lrb a1, a0, a1, 0 -; RV32IZBAXQCISLS-NEXT: srai a2, a1, 31 -; RV32IZBAXQCISLS-NEXT: add a0, a1, a1 -; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1 -; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 -; RV32IZBAXQCISLS-NEXT: add a1, a2, a1 +; RV32IZBAXQCISLS-NEXT: 
qc.lrb a0, a0, a1, 0 +; RV32IZBAXQCISLS-NEXT: add a0, a0, a0 ; RV32IZBAXQCISLS-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b + %1 = getelementptr i8, ptr %a, i32 %b %2 = load i8, ptr %1, align 1 - %3 = sext i8 %2 to i64 - %4 = add i64 %3, %3 - ret i64 %4 + %3 = sext i8 %2 to i32 + %4 = add i32 %3, %3 + ret i32 %4 } -define i8 @lurb_anyext(ptr %a, i32 %b) { -; RV32I-LABEL: lurb_anyext: +define i32 @lrbu(ptr %a, i32 %b) { +; RV32I-LABEL: lrbu: ; RV32I: # %bb.0: ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: add a0, a0, a0 ; RV32I-NEXT: ret ; -; RV32IZBA-LABEL: lurb_anyext: +; RV32IZBA-LABEL: lrbu: ; RV32IZBA: # %bb.0: ; RV32IZBA-NEXT: add a0, a0, a1 ; RV32IZBA-NEXT: lbu a0, 0(a0) +; RV32IZBA-NEXT: add a0, a0, a0 ; RV32IZBA-NEXT: ret ; -; RV32IZBAXQCISLS-LABEL: lurb_anyext: +; RV32IZBAXQCISLS-LABEL: lrbu: ; RV32IZBAXQCISLS: # %bb.0: ; RV32IZBAXQCISLS-NEXT: qc.lrbu a0, a0, a1, 0 +; RV32IZBAXQCISLS-NEXT: add a0, a0, a0 ; RV32IZBAXQCISLS-NEXT: ret - %1 = zext i32 %b to i64 - %2 = getelementptr i8, ptr %a, i64 %1 - %3 = load i8, ptr %2, align 1 - ret i8 %3 + %1 = getelementptr i8, ptr %a, i32 %b + %2 = load i8, ptr %1, align 1 + %3 = zext i8 %2 to i32 + %4 = add i32 %3, %3 + ret i32 %4 } -define i64 @lurb(ptr %a, i32 %b) { -; RV32I-LABEL: lurb: +define i64 @lrd(ptr %a, i32 %b) { +; RV32I-LABEL: lrd: ; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lb a1, 0(a0) -; RV32I-NEXT: srai a2, a1, 31 +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 4(a0) ; RV32I-NEXT: add a0, a1, a1 ; RV32I-NEXT: sltu a1, a0, a1 ; RV32I-NEXT: add a2, a2, a2 ; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: ret ; -; RV32IZBA-LABEL: lurb: +; RV32IZBA-LABEL: lrd: ; RV32IZBA: # %bb.0: -; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: lb a1, 0(a0) -; RV32IZBA-NEXT: srai a2, a1, 31 +; RV32IZBA-NEXT: sh3add a0, a1, a0 +; RV32IZBA-NEXT: lw a1, 0(a0) +; RV32IZBA-NEXT: lw a2, 4(a0) ; RV32IZBA-NEXT: add a0, a1, a1 ; RV32IZBA-NEXT: sltu a1, a0, a1 ; RV32IZBA-NEXT: add a2, a2, a2 ; RV32IZBA-NEXT: add a1, a2, a1 ; RV32IZBA-NEXT: ret ; -; RV32IZBAXQCISLS-LABEL: lurb: +; RV32IZBAXQCISLS-LABEL: lrd: ; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: qc.lrb a1, a0, a1, 0 -; RV32IZBAXQCISLS-NEXT: srai a2, a1, 31 -; RV32IZBAXQCISLS-NEXT: add a0, a1, a1 -; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1 -; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 -; RV32IZBAXQCISLS-NEXT: add a1, a2, a1 -; RV32IZBAXQCISLS-NEXT: ret - %1 = zext i32 %b to i64 - %2 = getelementptr i8, ptr %a, i64 %1 - %3 = load i8, ptr %2, align 1 - %4 = sext i8 %3 to i64 - %5 = add i64 %4, %4 - ret i64 %5 -} - -define i64 @lrbu(ptr %a, i64 %b) { -; RV32I-LABEL: lrbu: -; RV32I: # %bb.0: -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: add a0, a1, a1 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: ret -; -; RV32IZBA-LABEL: lrbu: -; RV32IZBA: # %bb.0: -; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: lbu a1, 0(a0) -; RV32IZBA-NEXT: add a0, a1, a1 -; RV32IZBA-NEXT: sltu a1, a0, a1 -; RV32IZBA-NEXT: ret -; -; RV32IZBAXQCISLS-LABEL: lrbu: -; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: qc.lrbu a1, a0, a1, 0 -; RV32IZBAXQCISLS-NEXT: add a0, a1, a1 -; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1 -; RV32IZBAXQCISLS-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b - %2 = load i8, ptr %1, align 1 - %3 = zext i8 %2 to i64 - %4 = add i64 %3, %3 - ret i64 %4 -} - -define i64 @lurbu(ptr %a, i32 %b) { -; RV32I-LABEL: lurbu: -; RV32I: # %bb.0: -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: 
add a0, a1, a1 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: ret -; -; RV32IZBA-LABEL: lurbu: -; RV32IZBA: # %bb.0: -; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: lbu a1, 0(a0) -; RV32IZBA-NEXT: add a0, a1, a1 -; RV32IZBA-NEXT: sltu a1, a0, a1 -; RV32IZBA-NEXT: ret -; -; RV32IZBAXQCISLS-LABEL: lurbu: -; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: qc.lrbu a1, a0, a1, 0 -; RV32IZBAXQCISLS-NEXT: add a0, a1, a1 -; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1 +; RV32IZBAXQCISLS-NEXT: qc.lrw a2, a0, a1, 3 +; RV32IZBAXQCISLS-NEXT: addi a0, a0, 4 +; RV32IZBAXQCISLS-NEXT: qc.lrw a1, a0, a1, 3 +; RV32IZBAXQCISLS-NEXT: add a0, a2, a2 +; RV32IZBAXQCISLS-NEXT: sltu a2, a0, a2 +; RV32IZBAXQCISLS-NEXT: add a1, a1, a1 +; RV32IZBAXQCISLS-NEXT: add a1, a1, a2 ; RV32IZBAXQCISLS-NEXT: ret - %1 = zext i32 %b to i64 - %2 = getelementptr i8, ptr %a, i64 %1 - %3 = load i8, ptr %2, align 1 - %4 = zext i8 %3 to i64 - %5 = add i64 %4, %4 - ret i64 %5 + %1 = getelementptr i64, ptr %a, i32 %b + %2 = load i64, ptr %1, align 8 + %3 = add i64 %2, %2 + ret i64 %3 } -define i64 @lrd_2(ptr %a, i64 %b) { +define i64 @lrd_2(ptr %a, i32 %b) { ; RV32I-LABEL: lrd_2: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 3 @@ -426,67 +357,134 @@ define i64 @lrd_2(ptr %a, i64 %b) { ; RV32IZBAXQCISLS-NEXT: add a1, a1, a1 ; RV32IZBAXQCISLS-NEXT: add a1, a1, a2 ; RV32IZBAXQCISLS-NEXT: ret - %1 = add i64 %b, 12 - %2 = getelementptr i64, ptr %a, i64 %1 + %1 = add i32 %b, 12 + %2 = getelementptr i64, ptr %a, i32 %1 %3 = load i64, ptr %2, align 8 %4 = add i64 %3, %3 ret i64 %4 } -define void @srb(ptr %a, i64 %b, i8 %c) { +define void @srb(ptr %a, i32 %b, i8 %c) { ; RV32I-LABEL: srb: ; RV32I: # %bb.0: -; RV32I-NEXT: add a3, a3, a3 +; RV32I-NEXT: add a2, a2, a2 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: sb a3, 0(a0) +; RV32I-NEXT: sb a2, 0(a0) ; RV32I-NEXT: ret ; ; RV32IZBA-LABEL: srb: ; RV32IZBA: # %bb.0: -; RV32IZBA-NEXT: add a3, a3, a3 +; RV32IZBA-NEXT: add a2, a2, a2 ; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: sb a3, 0(a0) +; RV32IZBA-NEXT: sb a2, 0(a0) ; RV32IZBA-NEXT: ret ; ; RV32IZBAXQCISLS-LABEL: srb: ; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: add a3, a3, a3 -; RV32IZBAXQCISLS-NEXT: qc.srb a3, a0, a1, 0 +; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 +; RV32IZBAXQCISLS-NEXT: qc.srb a2, a0, a1, 0 ; RV32IZBAXQCISLS-NEXT: ret %1 = add i8 %c, %c - %2 = getelementptr i8, ptr %a, i64 %b + %2 = getelementptr i8, ptr %a, i32 %b store i8 %1, ptr %2, align 1 ret void } -define void @surb(ptr %a, i32 %b, i8 %c) { -; RV32I-LABEL: surb: +define void @srh(ptr %a, i32 %b, i16 %c) { +; RV32I-LABEL: srh: ; RV32I: # %bb.0: ; RV32I-NEXT: add a2, a2, a2 +; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: sb a2, 0(a0) +; RV32I-NEXT: sh a2, 0(a0) ; RV32I-NEXT: ret ; -; RV32IZBA-LABEL: surb: +; RV32IZBA-LABEL: srh: ; RV32IZBA: # %bb.0: ; RV32IZBA-NEXT: add a2, a2, a2 -; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: sb a2, 0(a0) +; RV32IZBA-NEXT: sh1add a0, a1, a0 +; RV32IZBA-NEXT: sh a2, 0(a0) ; RV32IZBA-NEXT: ret ; -; RV32IZBAXQCISLS-LABEL: surb: +; RV32IZBAXQCISLS-LABEL: srh: ; RV32IZBAXQCISLS: # %bb.0: ; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 -; RV32IZBAXQCISLS-NEXT: qc.srb a2, a0, a1, 0 +; RV32IZBAXQCISLS-NEXT: qc.srh a2, a0, a1, 1 +; RV32IZBAXQCISLS-NEXT: ret + %1 = add i16 %c, %c + %2 = getelementptr i16, ptr %a, i32 %b + store i16 %1, ptr %2, align 2 + ret void +} + +define void @srw(ptr %a, i32 %b, i32 %c) { +; RV32I-LABEL: srw: +; RV32I: # %bb.0: +; RV32I-NEXT: add a2, a2, a2 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: 
add a0, a0, a1 +; RV32I-NEXT: sw a2, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: srw: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: add a2, a2, a2 +; RV32IZBA-NEXT: sh2add a0, a1, a0 +; RV32IZBA-NEXT: sw a2, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: srw: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 +; RV32IZBAXQCISLS-NEXT: qc.srw a2, a0, a1, 2 +; RV32IZBAXQCISLS-NEXT: ret + %1 = add i32 %c, %c + %2 = getelementptr i32, ptr %a, i32 %b + store i32 %1, ptr %2, align 4 + ret void +} + +define void @srd(ptr %a, i32 %b, i64 %c) { +; RV32I-LABEL: srd: +; RV32I: # %bb.0: +; RV32I-NEXT: add a4, a2, a2 +; RV32I-NEXT: add a3, a3, a3 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: sltu a2, a4, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: add a2, a3, a2 +; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: srd: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: add a4, a2, a2 +; RV32IZBA-NEXT: add a3, a3, a3 +; RV32IZBA-NEXT: sh3add a0, a1, a0 +; RV32IZBA-NEXT: sltu a1, a4, a2 +; RV32IZBA-NEXT: add a1, a3, a1 +; RV32IZBA-NEXT: sw a4, 0(a0) +; RV32IZBA-NEXT: sw a1, 4(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: srd: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: add a4, a2, a2 +; RV32IZBAXQCISLS-NEXT: add a3, a3, a3 +; RV32IZBAXQCISLS-NEXT: sltu a2, a4, a2 +; RV32IZBAXQCISLS-NEXT: qc.srw a4, a0, a1, 3 +; RV32IZBAXQCISLS-NEXT: add a2, a3, a2 +; RV32IZBAXQCISLS-NEXT: addi a0, a0, 4 +; RV32IZBAXQCISLS-NEXT: qc.srw a2, a0, a1, 3 ; RV32IZBAXQCISLS-NEXT: ret - %1 = zext i32 %b to i64 - %2 = add i8 %c, %c - %3 = getelementptr i8, ptr %a, i64 %1 - store i8 %2, ptr %3, align 1 + %1 = add i64 %c, %c + %2 = getelementptr i64, ptr %a, i32 %b + store i64 %1, ptr %2, align 8 ret void } -define i64 @lrd_large_shift(ptr %a, i64 %b) { +define i64 @lrd_large_shift(ptr %a, i32 %b) { ; RV32I-LABEL: lrd_large_shift: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 5 @@ -510,9 +508,9 @@ define i64 @lrd_large_shift(ptr %a, i64 %b) { ; RV32IZBAXQCISLS-NEXT: qc.lrw a0, a2, a1, 5 ; RV32IZBAXQCISLS-NEXT: qc.lrw a1, a3, a1, 5 ; RV32IZBAXQCISLS-NEXT: ret - %1 = add i64 %b, 12 - %2 = shl i64 %1, 2 - %3 = getelementptr i64, ptr %a, i64 %2 + %1 = add i32 %b, 12 + %2 = shl i32 %1, 2 + %3 = getelementptr i64, ptr %a, i32 %2 %4 = load i64, ptr %3, align 8 ret i64 %4 } diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll index 578f51a957a75..fc20fcb371179 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll @@ -858,14 +858,13 @@ define i64 @lurwu(ptr %a, i32 %b) { define i64 @lrd(ptr %a, i64 %b) { ; RV32XTHEADMEMIDX-LABEL: lrd: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: slli a2, a1, 3 +; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a0, a1, 3 +; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a0, a0, a2 -; RV32XTHEADMEMIDX-NEXT: lw a2, 4(a0) -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 +; RV32XTHEADMEMIDX-NEXT: add a0, a2, a2 +; RV32XTHEADMEMIDX-NEXT: sltu a2, a0, a2 +; RV32XTHEADMEMIDX-NEXT: add a1, a1, a1 +; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lrd: @@ -908,14 +907,13 @@ define i64 @lrd_2(ptr %a, i64 %b) { define i64 @lurd(ptr %a, i32 %b) { ; RV32XTHEADMEMIDX-LABEL: lurd: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: slli 
a2, a1, 3 +; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a0, a1, 3 +; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a0, a0, a2 -; RV32XTHEADMEMIDX-NEXT: lw a2, 4(a0) -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 +; RV32XTHEADMEMIDX-NEXT: add a0, a2, a2 +; RV32XTHEADMEMIDX-NEXT: sltu a2, a0, a2 +; RV32XTHEADMEMIDX-NEXT: add a1, a1, a1 +; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lurd: @@ -1047,11 +1045,10 @@ define void @srd(ptr %a, i64 %b, i64 %c) { ; RV32XTHEADMEMIDX-NEXT: add a2, a3, a3 ; RV32XTHEADMEMIDX-NEXT: add a4, a4, a4 ; RV32XTHEADMEMIDX-NEXT: sltu a3, a2, a3 -; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3 -; RV32XTHEADMEMIDX-NEXT: slli a4, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a4, a0, a4 ; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a4) +; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3 +; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 +; RV32XTHEADMEMIDX-NEXT: th.srw a3, a0, a1, 3 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: srd: @@ -1071,11 +1068,10 @@ define void @surd(ptr %a, i32 %b, i64 %c) { ; RV32XTHEADMEMIDX-NEXT: add a4, a2, a2 ; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3 ; RV32XTHEADMEMIDX-NEXT: sltu a2, a4, a2 -; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV32XTHEADMEMIDX-NEXT: slli a3, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a3, a0, a3 ; RV32XTHEADMEMIDX-NEXT: th.srw a4, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: sw a2, 4(a3) +; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 +; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 +; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: surd: diff --git a/llvm/test/CodeGen/SPARC/stack-protector-target.ll b/llvm/test/CodeGen/SPARC/stack-protector-target.ll new file mode 100644 index 0000000000000..f0e9aa8986ab8 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/stack-protector-target.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=sparc-unknown-linux < %s | FileCheck -check-prefix=LINUX32 %s +; RUN: llc -mtriple=sparc64-unknown-linux < %s | FileCheck -check-prefix=LINUX64 %s +; RUN: llc -mtriple=sparc-unknown-solaris < %s | FileCheck -check-prefixes=SOLARIS32 %s +; RUN: llc -mtriple=sparc64-unknown-solaris < %s | FileCheck -check-prefixes=SOLARIS64 %s +; RUN: llc -mtriple=sparc-unknown-openbsd < %s | FileCheck -check-prefix=OPENBSD32 %s +; RUN: llc -mtriple=sparc64-unknown-openbsd < %s | FileCheck -check-prefix=OPENBSD64 %s + +define void @func() sspreq nounwind { +; LINUX32-LABEL: func: +; LINUX32: ! %bb.0: +; LINUX32-NEXT: save %sp, -104, %sp +; LINUX32-NEXT: ld [%g7+20], %i0 +; LINUX32-NEXT: st %i0, [%fp+-4] +; LINUX32-NEXT: call capture +; LINUX32-NEXT: add %fp, -8, %o0 +; LINUX32-NEXT: ld [%fp+-4], %i0 +; LINUX32-NEXT: ld [%g7+20], %i1 +; LINUX32-NEXT: cmp %i1, %i0 +; LINUX32-NEXT: bne .LBB0_2 +; LINUX32-NEXT: nop +; LINUX32-NEXT: ! %bb.1: +; LINUX32-NEXT: ret +; LINUX32-NEXT: restore +; LINUX32-NEXT: .LBB0_2: +; LINUX32-NEXT: call __stack_chk_fail +; LINUX32-NEXT: nop +; +; LINUX64-LABEL: func: +; LINUX64: .register %g7, #ignore +; LINUX64-NEXT: ! 
%bb.0: +; LINUX64-NEXT: save %sp, -192, %sp +; LINUX64-NEXT: ldx [%g7+40], %i0 +; LINUX64-NEXT: stx %i0, [%fp+2039] +; LINUX64-NEXT: call capture +; LINUX64-NEXT: add %fp, 2035, %o0 +; LINUX64-NEXT: ldx [%fp+2039], %i0 +; LINUX64-NEXT: ldx [%g7+40], %i1 +; LINUX64-NEXT: cmp %i1, %i0 +; LINUX64-NEXT: bne %xcc, .LBB0_2 +; LINUX64-NEXT: nop +; LINUX64-NEXT: ! %bb.1: +; LINUX64-NEXT: ret +; LINUX64-NEXT: restore +; LINUX64-NEXT: .LBB0_2: +; LINUX64-NEXT: call __stack_chk_fail +; LINUX64-NEXT: nop +; +; SOLARIS32-LABEL: func: +; SOLARIS32: ! %bb.0: +; SOLARIS32-NEXT: save %sp, -104, %sp +; SOLARIS32-NEXT: sethi %hi(__stack_chk_guard), %i0 +; SOLARIS32-NEXT: ld [%i0+%lo(__stack_chk_guard)], %i1 +; SOLARIS32-NEXT: st %i1, [%fp+-4] +; SOLARIS32-NEXT: call capture +; SOLARIS32-NEXT: add %fp, -8, %o0 +; SOLARIS32-NEXT: ld [%i0+%lo(__stack_chk_guard)], %i0 +; SOLARIS32-NEXT: ld [%fp+-4], %i1 +; SOLARIS32-NEXT: cmp %i0, %i1 +; SOLARIS32-NEXT: bne .LBB0_2 +; SOLARIS32-NEXT: nop +; SOLARIS32-NEXT: ! %bb.1: +; SOLARIS32-NEXT: ret +; SOLARIS32-NEXT: restore +; SOLARIS32-NEXT: .LBB0_2: +; SOLARIS32-NEXT: call __stack_chk_fail +; SOLARIS32-NEXT: nop +; +; SOLARIS64-LABEL: func: +; SOLARIS64: ! %bb.0: +; SOLARIS64-NEXT: save %sp, -192, %sp +; SOLARIS64-NEXT: sethi %h44(__stack_chk_guard), %i0 +; SOLARIS64-NEXT: add %i0, %m44(__stack_chk_guard), %i0 +; SOLARIS64-NEXT: sllx %i0, 12, %i0 +; SOLARIS64-NEXT: ldx [%i0+%l44(__stack_chk_guard)], %i1 +; SOLARIS64-NEXT: stx %i1, [%fp+2039] +; SOLARIS64-NEXT: call capture +; SOLARIS64-NEXT: add %fp, 2035, %o0 +; SOLARIS64-NEXT: ldx [%i0+%l44(__stack_chk_guard)], %i0 +; SOLARIS64-NEXT: ldx [%fp+2039], %i1 +; SOLARIS64-NEXT: cmp %i0, %i1 +; SOLARIS64-NEXT: bne %xcc, .LBB0_2 +; SOLARIS64-NEXT: nop +; SOLARIS64-NEXT: ! %bb.1: +; SOLARIS64-NEXT: ret +; SOLARIS64-NEXT: restore +; SOLARIS64-NEXT: .LBB0_2: +; SOLARIS64-NEXT: call __stack_chk_fail +; SOLARIS64-NEXT: nop +; +; OPENBSD32-LABEL: func: +; OPENBSD32: ! %bb.0: +; OPENBSD32-NEXT: save %sp, -104, %sp +; OPENBSD32-NEXT: sethi %hi(__guard_local), %i0 +; OPENBSD32-NEXT: ld [%i0+%lo(__guard_local)], %i1 +; OPENBSD32-NEXT: st %i1, [%fp+-4] +; OPENBSD32-NEXT: call capture +; OPENBSD32-NEXT: add %fp, -8, %o0 +; OPENBSD32-NEXT: ld [%i0+%lo(__guard_local)], %i0 +; OPENBSD32-NEXT: ld [%fp+-4], %i1 +; OPENBSD32-NEXT: cmp %i0, %i1 +; OPENBSD32-NEXT: bne .LBB0_2 +; OPENBSD32-NEXT: nop +; OPENBSD32-NEXT: ! %bb.1: ! %SP_return +; OPENBSD32-NEXT: ret +; OPENBSD32-NEXT: restore +; OPENBSD32-NEXT: .LBB0_2: ! %CallStackCheckFailBlk +; OPENBSD32-NEXT: sethi %hi(.LSSH), %i0 +; OPENBSD32-NEXT: call __stack_smash_handler +; OPENBSD32-NEXT: add %i0, %lo(.LSSH), %o0 +; +; OPENBSD64-LABEL: func: +; OPENBSD64: ! %bb.0: +; OPENBSD64-NEXT: save %sp, -192, %sp +; OPENBSD64-NEXT: sethi %h44(__guard_local), %i0 +; OPENBSD64-NEXT: add %i0, %m44(__guard_local), %i0 +; OPENBSD64-NEXT: sllx %i0, 12, %i0 +; OPENBSD64-NEXT: ldx [%i0+%l44(__guard_local)], %i1 +; OPENBSD64-NEXT: stx %i1, [%fp+2039] +; OPENBSD64-NEXT: call capture +; OPENBSD64-NEXT: add %fp, 2035, %o0 +; OPENBSD64-NEXT: ldx [%i0+%l44(__guard_local)], %i0 +; OPENBSD64-NEXT: ldx [%fp+2039], %i1 +; OPENBSD64-NEXT: cmp %i0, %i1 +; OPENBSD64-NEXT: bne %xcc, .LBB0_2 +; OPENBSD64-NEXT: nop +; OPENBSD64-NEXT: ! %bb.1: ! %SP_return +; OPENBSD64-NEXT: ret +; OPENBSD64-NEXT: restore +; OPENBSD64-NEXT: .LBB0_2: ! 
%CallStackCheckFailBlk +; OPENBSD64-NEXT: sethi %h44(.LSSH), %i0 +; OPENBSD64-NEXT: add %i0, %m44(.LSSH), %i0 +; OPENBSD64-NEXT: sllx %i0, 12, %i0 +; OPENBSD64-NEXT: call __stack_smash_handler +; OPENBSD64-NEXT: add %i0, %l44(.LSSH), %o0 + %alloca = alloca i32, align 4 + call void @capture(ptr %alloca) + ret void +} + +declare void @capture(ptr) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll new file mode 100644 index 0000000000000..b18e929568534 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll @@ -0,0 +1,36 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for refract are lowered correctly. + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @refract_half(<4 x half> noundef %I, <4 x half> noundef %N, half noundef %ETA) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#float_16:]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Refract %[[#arg0]] %[[#arg1]] %[[#arg2]] + %spv.refract.i = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half> %I, <4 x half> %N, half %ETA) + ret <4 x half> %spv.refract.i +} + +define noundef <4 x float> @refract_float4(<4 x float> noundef %I, <4 x float> noundef %N, float noundef %ETA) { +entry: + %conv.i = fpext reassoc nnan ninf nsz arcp afn float %ETA to double + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#float_32:]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Refract %[[#arg0]] %[[#arg1]] %[[#arg2]] + %spv.refract.i = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float> %I, <4 x float> %N, float %ETA) + ret <4 x float> %spv.refract.i +} + +declare <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half>, <4 x half>, half) +declare <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float>, <4 x float>, float) diff --git a/llvm/test/CodeGen/SPIRV/opencl/refract-error.ll b/llvm/test/CodeGen/SPIRV/opencl/refract-error.ll new file mode 100644 index 0000000000000..28208fb2e72f8 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/refract-error.ll @@ -0,0 +1,12 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %{{.*}} = G_INTRINSIC intrinsic(@llvm.spv.refract), %{{.*}}, %{{.*}}, %{{.*}} is only supported with the GLSL extended instruction set.
+ +define noundef <4 x float> @refract_float4(<4 x float> noundef %I, <4 x float> noundef %N, float noundef %ETA) { +entry: + %spv.refract = call <4 x float> @llvm.spv.refract.f32(<4 x float> %I, <4 x float> %N, float %ETA) + ret <4 x float> %spv.refract +} + +declare <4 x float> @llvm.spv.refract.f32(<4 x float>, <4 x float>, float) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll index 0a02a8bf56ace..b179732371d97 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll @@ -1,17 +1,109 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalInvocationId -; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId -; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]] -; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]] +; CHECK-SPIRV-DAG: OpDecorate %[[#Id0:]] BuiltIn GlobalLinearId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id1:]] BuiltIn GlobalInvocationId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id2:]] BuiltIn LocalInvocationIndex +; CHECK-SPIRV-DAG: OpDecorate %[[#Id3:]] BuiltIn WorkDim +; CHECK-SPIRV-DAG: OpDecorate %[[#Id4:]] BuiltIn SubgroupSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id5:]] BuiltIn SubgroupMaxSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id6:]] BuiltIn NumSubgroups +; CHECK-SPIRV-DAG: OpDecorate %[[#Id7:]] BuiltIn NumEnqueuedSubgroups +; CHECK-SPIRV-DAG: OpDecorate %[[#Id8:]] BuiltIn SubgroupId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id9:]] BuiltIn SubgroupLocalInvocationId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id10:]] BuiltIn SubgroupEqMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id11:]] BuiltIn SubgroupGeMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id12:]] BuiltIn SubgroupGtMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id13:]] BuiltIn SubgroupLeMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id14:]] BuiltIn SubgroupLtMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id15:]] BuiltIn LocalInvocationId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id16:]] BuiltIn WorkgroupSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id17:]] BuiltIn GlobalSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id18:]] BuiltIn WorkgroupId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id19:]] BuiltIn EnqueuedWorkgroupSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id20:]] BuiltIn NumWorkgroups +; CHECK-SPIRV-DAG: OpDecorate %[[#Id21:]] BuiltIn GlobalOffset + +; CHECK-SPIRV: %[[#Id0:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id1:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id2:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id3:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id4:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id5:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id6:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id7:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id8:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id9:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id10:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id11:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id12:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id13:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id14:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id15:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id16:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id17:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id18:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: 
%[[#Id19:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id20:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id21:]] = OpVariable %[[#]] Input define spir_kernel void @f() { entry: %0 = call spir_func i32 @_Z29__spirv_BuiltInGlobalLinearIdv() %1 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 1) + %2 = call spir_func i64 @_Z35__spirv_BuiltInLocalInvocationIndexv() + %3 = call spir_func i32 @_Z22__spirv_BuiltInWorkDimv() + %4 = call spir_func i32 @_Z27__spirv_BuiltInSubgroupSizev() + %5 = call spir_func i32 @_Z30__spirv_BuiltInSubgroupMaxSizev() + %6 = call spir_func i32 @_Z27__spirv_BuiltInNumSubgroupsv() + %7 = call spir_func i32 @_Z35__spirv_BuiltInNumEnqueuedSubgroupsv() + %8 = call spir_func i32 @_Z25__spirv_BuiltInSubgroupIdv() + %9 = call spir_func i32 @_Z40__spirv_BuiltInSubgroupLocalInvocationIdv() + %10 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupEqMaskv() + %11 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupEqMaskKHRv() + %12 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupGeMaskv() + %13 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupGeMaskKHRv() + %14 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupGtMaskv() + %15 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupGtMaskKHRv() + %16 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupLeMaskv() + %17 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupLeMaskKHRv() + %18 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupLtMaskv() + %19 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupLtMaskKHRv() + %20 = call spir_func i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 0) + %21 = call spir_func i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 0) + %22 = call spir_func i64 @_Z25__spirv_BuiltInGlobalSizei(i32 0) + %23 = call spir_func i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 0) + %24 = call spir_func i64 @_Z36__spirv_BuiltInEnqueuedWorkgroupSizei(i32 0) + %25 = call spir_func i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 0) + %26 = call spir_func i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 0) + ret void } declare spir_func i32 @_Z29__spirv_BuiltInGlobalLinearIdv() declare spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32) +declare spir_func i64 @_Z35__spirv_BuiltInLocalInvocationIndexv() +declare spir_func i32 @_Z22__spirv_BuiltInWorkDimv() +declare spir_func i32 @_Z27__spirv_BuiltInSubgroupSizev() +declare spir_func i32 @_Z30__spirv_BuiltInSubgroupMaxSizev() +declare spir_func i32 @_Z27__spirv_BuiltInNumSubgroupsv() +declare spir_func i32 @_Z35__spirv_BuiltInNumEnqueuedSubgroupsv() +declare spir_func i32 @_Z25__spirv_BuiltInSubgroupIdv() +declare spir_func i32 @_Z40__spirv_BuiltInSubgroupLocalInvocationIdv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupEqMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupEqMaskKHRv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupGeMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupGeMaskKHRv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupGtMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupGtMaskKHRv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupLeMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupLeMaskKHRv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupLtMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupLtMaskKHRv() +declare spir_func i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32) +declare spir_func i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32) +declare spir_func i64 
@_Z25__spirv_BuiltInGlobalSizei(i32) +declare spir_func i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32) +declare spir_func i64 @_Z36__spirv_BuiltInEnqueuedWorkgroupSizei(i32) +declare spir_func i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32) +declare spir_func i64 @_Z27__spirv_BuiltInGlobalOffseti(i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll index a990cee1f5fb3..c9a51f0507844 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll @@ -377,34 +377,30 @@ entry: define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1_i1(<2 x i64> %a, <2 x i64> %b, i64 %c) { ; CHECK-LABEL: cmpeqz_v2i1_i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, r4, d3 -; CHECK-NEXT: csetm r12, eq -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: orrs r3, r4 -; CHECK-NEXT: vmov r4, r3, d0 -; CHECK-NEXT: csetm r5, eq -; CHECK-NEXT: orrs r3, r4 -; CHECK-NEXT: vmov r3, r4, d1 -; CHECK-NEXT: csetm lr, eq -; CHECK-NEXT: orrs r3, r4 -; CHECK-NEXT: csetm r4, eq ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: beq .LBB15_2 -; CHECK-NEXT: @ %bb.1: @ %select.false -; CHECK-NEXT: bfi r2, r12, #0, #8 -; CHECK-NEXT: bfi r2, r5, #8, #8 +; CHECK-NEXT: @ %bb.1: @ %select.false.sink +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d3 ; CHECK-NEXT: b .LBB15_3 -; CHECK-NEXT: .LBB15_2: -; CHECK-NEXT: bfi r2, lr, #0, #8 -; CHECK-NEXT: bfi r2, r4, #8, #8 +; CHECK-NEXT: .LBB15_2: @ %select.true.sink +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: vmov r0, r2, d1 ; CHECK-NEXT: .LBB15_3: @ %select.end -; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: bfi r1, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer %c2 = icmp eq <2 x i64> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-1.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-1.ll index 23924b2fb2f8c..dc06c268b0ef4 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-1.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-outliner-1.ll @@ -1,9 +1,9 @@ -; RUN: llc --force-dwarf-frame-section --exception-model=arm %s -o - | FileCheck %s -; RUN: llc --filetype=obj %s --exception-model=arm -o - | llvm-readelf -s --unwind - | FileCheck %s --check-prefix=UNWIND +; RUN: llc --force-dwarf-frame-section --exception-model=arm %s -o - --target-abi=aapcs16 | FileCheck %s +; RUN: llc --filetype=obj %s --exception-model=arm -o - --target-abi=aapcs16 | llvm-readelf -s --unwind - | FileCheck %s --check-prefix=UNWIND target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" ; Triple tweaked so we get 16-byte stack alignment and better test coverage. 
-target triple = "armv7m-none-nacl-android" +target triple = "armv7m-none-linux-android" ; -Oz ; volatile int a, b, c, d, e, f, g, h, i; diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index f6d66ab47ce05..2911edfbfd409 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -367,44 +367,49 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll %eax, %esi -; X86-NEXT: cmovll %ebx, %edi -; X86-NEXT: cmovll %ebp, %edx -; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %eax +; X86-NEXT: sbbl 32(%ebp), %edx +; X86-NEXT: sbbl 36(%ebp), %esi +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %ebx, %edx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: negl %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %eax, 8(%edx) +; X86-NEXT: movl %edi, 12(%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -438,44 +443,49 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll %eax, %esi -; X86-NEXT: cmovll %ebx, %edi -; X86-NEXT: cmovll %ebp, %edx -; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %eax +; X86-NEXT: sbbl 32(%ebp), %edx +; X86-NEXT: sbbl 36(%ebp), %esi +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %ebx, %edx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: negl %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %eax, 8(%edx) +; X86-NEXT: movl %edi, 12(%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -639,55 +649,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_minmax_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovll %edx, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: cmovll %esi, %edx 
-; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %eax, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: sbbl 44(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmovll 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: cmovll 28(%ebp), %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: cmpl %edi, %esi +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: sbbl 28(%ebp), %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl 32(%ebp), %edi +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sbbl 36(%ebp), %edi +; X86-NEXT: cmovll 36(%ebp), %ebx +; X86-NEXT: cmovll 32(%ebp), %edx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: cmovll 28(%ebp), %edi +; X86-NEXT: cmovll 24(%ebp), %esi +; X86-NEXT: subl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -848,37 +862,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovgel (%esp), %edx # 4-byte Folded Reload -; 
X86-NEXT: cmovgel %ebx, %esi -; X86-NEXT: cmovgel %ebp, %ecx -; X86-NEXT: cmovgel %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl 44(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 48(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 52(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 40(%ebp), %ecx +; X86-NEXT: sbbl 44(%ebp), %edx +; X86-NEXT: sbbl 48(%ebp), %esi +; X86-NEXT: sbbl 52(%ebp), %ebx +; X86-NEXT: cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovgel %edi, %esi +; X86-NEXT: cmovgel %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1118,35 +1136,39 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_subnsw_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: subl %edi, %ebp -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %esi +; X86-NEXT: sbbl 44(%ebp), %edx +; X86-NEXT: sbbl 48(%ebp), %ecx +; X86-NEXT: sbbl 52(%ebp), %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %eax +; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl %ebp, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1175,35 
+1197,39 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_subnsw_i128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: subl %edi, %ebp -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %esi +; X86-NEXT: sbbl 44(%ebp), %edx +; X86-NEXT: sbbl 48(%ebp), %ecx +; X86-NEXT: sbbl 52(%ebp), %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %eax +; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl %ebp, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 0356c2702a419..a1a4ba81ae493 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -343,37 +343,41 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, 
%esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -404,37 +408,41 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; 
X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -585,37 +593,41 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_minmax_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -768,37 +780,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: 
cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1027,35 +1043,38 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_subnsw_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_subnsw_i128: @@ -1079,35 +1098,38 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_subnsw_i128_undef: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; 
X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_subnsw_i128_undef: @@ -1282,37 +1304,41 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_select_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: 
subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 6bda99c89a37e..b7c34070f1af6 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -355,39 +355,43 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl 40(%ebp), %ecx +; X86-NEXT: sbbl 44(%ebp), %edi +; X86-NEXT: sbbl 48(%ebp), %esi +; X86-NEXT: sbbl 52(%ebp), %eax ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -423,39 +427,43 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl 
{{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl 40(%ebp), %ecx +; X86-NEXT: sbbl 44(%ebp), %edi +; X86-NEXT: sbbl 48(%ebp), %esi +; X86-NEXT: sbbl 52(%ebp), %eax +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: negl %ecx ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -621,55 +629,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_minmax_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovbl %edx, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: cmovbl %esi, %edx -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %eax, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ecx, 4(%eax) -; 
X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: sbbl 44(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmovbl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: cmovbl 28(%ebp), %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: cmovbl %edi, %ecx +; X86-NEXT: cmpl %edi, %esi +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: sbbl 28(%ebp), %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl 32(%ebp), %edi +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sbbl 36(%ebp), %edi +; X86-NEXT: cmovbl 36(%ebp), %ebx +; X86-NEXT: cmovbl 32(%ebp), %edx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: cmovbl 28(%ebp), %edi +; X86-NEXT: cmovbl 24(%ebp), %esi +; X86-NEXT: subl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -827,39 +839,43 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl 40(%ebp), %ecx +; X86-NEXT: sbbl 44(%ebp), %edi +; X86-NEXT: sbbl 48(%ebp), %esi +; X86-NEXT: sbbl 52(%ebp), %eax +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: negl %ecx ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx 
-; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index 27acec32fd348..043c9155f52f9 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -326,35 +326,38 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_ext_i128: @@ -381,35 +384,38 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128_undef: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, 
%ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_ext_i128_undef: @@ -548,35 +554,38 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_minmax_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_minmax_i128: @@ -717,35 +726,38 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; 
X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_cmp_i128: @@ -887,35 +899,38 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_select_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_select_i128: diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index bae140abdf6b1..e252d5953e60e 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -144,31 +144,34 @@ define i128 @test_i128(i128 %a) nounwind { ; ; X86-LABEL: test_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: 
xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: subl %edx, %ebx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %r = call i128 @llvm.abs.i128(i128 %a, i1 false) ret i128 %r @@ -688,13 +691,17 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind { ; ; X86-LABEL: test_sextinreg_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 24(%ebp), %esi ; X86-NEXT: xorl %edx, %esi ; X86-NEXT: subl %edx, %esi ; X86-NEXT: sbbl %edx, %ecx @@ -702,7 +709,9 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind { ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %shl = shl i128 %a, 64 %ashr = ashr exact i128 %shl, 64 diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll index c2bfcf57185e3..1df284fb9fe2c 100644 --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -104,18 +104,21 @@ define i24 @test_i24_add_add_idx(i24 %x, i24 %y, i24 %z) nounwind { define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { ; X86-LABEL: test_i128_add_add_idx: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl $5, {{[0-9]+}}(%esp) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: addl 24(%ebp), %esi +; X86-NEXT: adcl 28(%ebp), %edi +; X86-NEXT: adcl 32(%ebp), %ecx +; X86-NEXT: adcl 36(%ebp), %edx +; X86-NEXT: btl $5, 64(%ebp) ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl 
$0, %edi ; X86-NEXT: adcl $0, %ecx @@ -124,8 +127,10 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test_i128_add_add_idx: diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll index 0eb2c630e6818..f13627b55856f 100644 --- a/llvm/test/CodeGen/X86/arg-copy-elide.ll +++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll @@ -188,11 +188,11 @@ define void @split_i128(ptr %sret, i128 %x) { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: andl $-16, %esp ; CHECK-NEXT: subl $48, %esp -; CHECK-NEXT: movl 12(%ebp), %eax +; CHECK-NEXT: movl 24(%ebp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 16(%ebp), %ebx -; CHECK-NEXT: movl 20(%ebp), %esi -; CHECK-NEXT: movl 24(%ebp), %edi +; CHECK-NEXT: movl 28(%ebp), %ebx +; CHECK-NEXT: movl 32(%ebp), %esi +; CHECK-NEXT: movl 36(%ebp), %edi ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll index f66f0c0ceabc4..cc58bc1e44f37 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll @@ -628,13 +628,19 @@ define half @s128_to_half(i128 %x) { ; ; X86-LABEL: s128_to_half: ; X86: # %bb.0: -; X86-NEXT: subl $16, %esp -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: vmovups 8(%ebp), %xmm0 ; X86-NEXT: vmovups %xmm0, (%esp) ; X86-NEXT: calll __floattihf -; X86-NEXT: addl $16, %esp -; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl %a = sitofp i128 %x to half ret half %a @@ -713,13 +719,19 @@ define half @u128_to_half(i128 %x) { ; ; X86-LABEL: u128_to_half: ; X86: # %bb.0: -; X86-NEXT: subl $16, %esp -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: vmovups 8(%ebp), %xmm0 ; X86-NEXT: vmovups %xmm0, (%esp) ; X86-NEXT: calll __floatuntihf -; X86-NEXT: addl $16, %esp -; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl %a = uitofp i128 %x to half ret half %a @@ -1020,11 +1032,15 @@ define half @f128_to_half(fp128 %x) nounwind { ; ; X86-LABEL: f128_to_half: ; X86: # %bb.0: -; X86-NEXT: subl $16, %esp -; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: vmovups 8(%ebp), %xmm0 ; X86-NEXT: vmovups %xmm0, (%esp) ; X86-NEXT: calll __trunctfhf2 -; X86-NEXT: addl $16, %esp +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl %a = fptrunc fp128 %x to half ret half %a diff --git a/llvm/test/CodeGen/X86/bitselect.ll 
b/llvm/test/CodeGen/X86/bitselect.ll index 4fc0827ac4dd6..33381313d3c19 100644 --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -146,37 +146,40 @@ define i64 @bitselect_i64(i64 %a, i64 %b, i64 %m) nounwind { define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind { ; X86-LABEL: bitselect_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edi, %ecx -; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: andl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: andl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: andl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: andl 56(%ebp), %ecx +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: movl 44(%ebp), %esi +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: andl 60(%ebp), %esi +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl 48(%ebp), %edi +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: andl 64(%ebp), %edi +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: andl 68(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-NOBMI-LABEL: bitselect_i128: diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll index 312f94c041235..143e10e6909e4 100644 --- a/llvm/test/CodeGen/X86/bsf.ll +++ b/llvm/test/CodeGen/X86/bsf.ll @@ -263,70 +263,78 @@ define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind { ; X86-LABEL: cmov_bsf128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: orl %ebp, %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %edx, %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %eax, %edx ; X86-NEXT: je .LBB8_1 ; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: jne .LBB8_3 ; X86-NEXT: # %bb.4: # %cond.false -; 
X86-NEXT: rep bsfl %edi, %esi -; X86-NEXT: addl $32, %esi -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB8_7 -; X86-NEXT: .LBB8_6: -; X86-NEXT: rep bsfl %eax, %edx -; X86-NEXT: jmp .LBB8_8 +; X86-NEXT: rep bsfl %esi, %eax +; X86-NEXT: addl $32, %eax +; X86-NEXT: jmp .LBB8_5 ; X86-NEXT: .LBB8_1: -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: movl $128, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: movl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: jmp .LBB8_11 ; X86-NEXT: .LBB8_3: -; X86-NEXT: rep bsfl %ecx, %esi -; X86-NEXT: testl %eax, %eax +; X86-NEXT: rep bsfl %ecx, %eax +; X86-NEXT: .LBB8_5: # %cond.false +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %edi, %edi ; X86-NEXT: jne .LBB8_6 -; X86-NEXT: .LBB8_7: # %cond.false -; X86-NEXT: rep bsfl %ebp, %edx +; X86-NEXT: # %bb.7: # %cond.false +; X86-NEXT: rep bsfl %ebx, %edx ; X86-NEXT: addl $32, %edx +; X86-NEXT: jmp .LBB8_8 +; X86-NEXT: .LBB8_6: +; X86-NEXT: rep bsfl %edi, %edx ; X86-NEXT: .LBB8_8: # %cond.false -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: jne .LBB8_10 ; X86-NEXT: # %bb.9: # %cond.false ; X86-NEXT: addl $64, %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: .LBB8_10: # %cond.false -; X86-NEXT: xorl %ebp, %ebp +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: .LBB8_11: # %cond.end -; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: jne .LBB8_13 -; X86-NEXT: # %bb.12: -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: .LBB8_13: # %cond.end -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: orl 32(%ebp), %ecx +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: je .LBB8_12 +; X86-NEXT: # %bb.13: # %cond.end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jmp .LBB8_14 +; X86-NEXT: .LBB8_12: +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: .LBB8_14: # %cond.end +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -361,46 +369,49 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X86-LABEL: cmov_bsf128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: orl %edi, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %esi +; 
X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: je .LBB9_11 ; X86-NEXT: # %bb.1: # %select.true.sink ; X86-NEXT: testl %edx, %edx ; X86-NEXT: jne .LBB9_2 ; X86-NEXT: # %bb.3: # %select.true.sink -; X86-NEXT: rep bsfl %ecx, %edi -; X86-NEXT: addl $32, %edi -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: rep bsfl %ecx, %ebx +; X86-NEXT: addl $32, %ebx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: je .LBB9_6 ; X86-NEXT: .LBB9_5: -; X86-NEXT: rep bsfl %ebx, %esi +; X86-NEXT: rep bsfl %edi, %esi ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: je .LBB9_8 ; X86-NEXT: jmp .LBB9_9 ; X86-NEXT: .LBB9_11: # %select.end -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 52(%ebp), %ecx +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 44(%ebp), %esi +; X86-NEXT: movl 40(%ebp), %edi ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: jmp .LBB9_10 ; X86-NEXT: .LBB9_2: -; X86-NEXT: rep bsfl %edx, %edi -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: rep bsfl %edx, %ebx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: jne .LBB9_5 ; X86-NEXT: .LBB9_6: # %select.true.sink ; X86-NEXT: rep bsfl %esi, %esi @@ -409,13 +420,14 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X86-NEXT: jne .LBB9_9 ; X86-NEXT: .LBB9_8: # %select.true.sink ; X86-NEXT: addl $64, %esi -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: .LBB9_9: # %select.true.sink -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) ; X86-NEXT: movl $0, 4(%eax) ; X86-NEXT: .LBB9_10: # %select.true.sink +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll index fbca4af425eac..ab0478a4e944b 100644 --- a/llvm/test/CodeGen/X86/bsr.ll +++ b/llvm/test/CodeGen/X86/bsr.ll @@ -291,79 +291,80 @@ define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind { ; X86-LABEL: cmov_bsr128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: orl %ebp, %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: orl %edx, %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %eax, %edx ; X86-NEXT: je .LBB8_1 ; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %esi, %esi ; X86-NEXT: jne .LBB8_3 ; X86-NEXT: # %bb.4: # %cond.false -; X86-NEXT: bsrl %ebx, %edx -; X86-NEXT: xorl $31, %edx -; X86-NEXT: orl $32, %edx +; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi ; X86-NEXT: testl %edi, %edi ; 
X86-NEXT: je .LBB8_7 ; X86-NEXT: .LBB8_6: -; X86-NEXT: bsrl %edi, %esi -; X86-NEXT: xorl $31, %esi +; X86-NEXT: bsrl %edi, %eax +; X86-NEXT: xorl $31, %eax ; X86-NEXT: jmp .LBB8_8 ; X86-NEXT: .LBB8_1: -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl $128, %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $128, %esi ; X86-NEXT: jmp .LBB8_11 ; X86-NEXT: .LBB8_3: -; X86-NEXT: bsrl %ebp, %edx -; X86-NEXT: xorl $31, %edx +; X86-NEXT: bsrl %esi, %esi +; X86-NEXT: xorl $31, %esi ; X86-NEXT: testl %edi, %edi ; X86-NEXT: jne .LBB8_6 ; X86-NEXT: .LBB8_7: # %cond.false -; X86-NEXT: bsrl %ecx, %esi -; X86-NEXT: xorl $31, %esi -; X86-NEXT: orl $32, %esi +; X86-NEXT: bsrl %ecx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl $32, %eax ; X86-NEXT: .LBB8_8: # %cond.false -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: orl %ebp, %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: orl 36(%ebp), %edx ; X86-NEXT: jne .LBB8_10 ; X86-NEXT: # %bb.9: # %cond.false -; X86-NEXT: orl $64, %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl $64, %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: .LBB8_10: # %cond.false -; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB8_11: # %cond.end -; X86-NEXT: xorl %esi, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %ebp, %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: orl 32(%ebp), %ecx +; X86-NEXT: orl 36(%ebp), %edi ; X86-NEXT: orl %ecx, %edi ; X86-NEXT: je .LBB8_12 ; X86-NEXT: # %bb.13: # %cond.end -; X86-NEXT: xorl $127, %edx -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: xorl $127, %esi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: jmp .LBB8_14 ; X86-NEXT: .LBB8_12: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: movl 48(%ebp), %ebx +; X86-NEXT: movl 44(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %esi ; X86-NEXT: .LBB8_14: # %cond.end -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -398,62 +399,67 @@ define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind { ; X86-LABEL: cmov_bsr128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: testl %edi, %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: testl %eax, %eax ; X86-NEXT: jne .LBB9_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: bsrl %esi, %ecx -; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi ; X86-NEXT: jmp .LBB9_3 ; X86-NEXT: .LBB9_1: -; X86-NEXT: bsrl %edi, %ecx -; X86-NEXT: xorl $31, %ecx +; X86-NEXT: bsrl %eax, %esi +; X86-NEXT: xorl $31, %esi ; X86-NEXT: .LBB9_3: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl 24(%ebp), %ebx ; X86-NEXT: 
testl %edx, %edx ; X86-NEXT: jne .LBB9_4 ; X86-NEXT: # %bb.5: -; X86-NEXT: bsrl %ebx, %ebp -; X86-NEXT: xorl $31, %ebp -; X86-NEXT: orl $32, %ebp -; X86-NEXT: jmp .LBB9_6 +; X86-NEXT: bsrl %ebx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: orl %eax, %edi +; X86-NEXT: je .LBB9_7 +; X86-NEXT: jmp .LBB9_8 ; X86-NEXT: .LBB9_4: -; X86-NEXT: bsrl %edx, %ebp -; X86-NEXT: xorl $31, %ebp -; X86-NEXT: .LBB9_6: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %edi, %esi +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl %eax, %edi ; X86-NEXT: jne .LBB9_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: orl $64, %ebp -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: .LBB9_7: +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: .LBB9_8: -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl 32(%ebp), %ebx ; X86-NEXT: orl %edx, %ebx ; X86-NEXT: jne .LBB9_9 ; X86-NEXT: # %bb.10: -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 52(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %ecx ; X86-NEXT: jmp .LBB9_11 ; X86-NEXT: .LBB9_9: -; X86-NEXT: xorl $127, %ecx +; X86-NEXT: xorl $127, %esi +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: xorl %esi, %esi ; X86-NEXT: xorl %edi, %edi ; X86-NEXT: .LBB9_11: +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll index 6d5e995a6d574..673b7f16de75c 100644 --- a/llvm/test/CodeGen/X86/bswap-wide-int.ll +++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll @@ -41,13 +41,16 @@ define i64 @bswap_i64(i64 %a0) nounwind { define i128 @bswap_i128(i128 %a0) nounwind { ; X86-LABEL: bswap_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %edi ; X86-NEXT: bswapl %edi ; X86-NEXT: bswapl %esi ; X86-NEXT: bswapl %edx @@ -56,25 +59,32 @@ define i128 @bswap_i128(i128 %a0) nounwind { ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X86-MOVBE-LABEL: bswap_i128: ; X86-MOVBE: # %bb.0: +; X86-MOVBE-NEXT: pushl %ebp +; X86-MOVBE-NEXT: movl %esp, %ebp ; X86-MOVBE-NEXT: pushl %edi ; X86-MOVBE-NEXT: pushl %esi -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-MOVBE-NEXT: andl $-16, %esp +; 
X86-MOVBE-NEXT: movl 8(%ebp), %eax +; X86-MOVBE-NEXT: movl 32(%ebp), %ecx +; X86-MOVBE-NEXT: movl 36(%ebp), %edx +; X86-MOVBE-NEXT: movl 24(%ebp), %esi +; X86-MOVBE-NEXT: movl 28(%ebp), %edi ; X86-MOVBE-NEXT: movbel %esi, 12(%eax) ; X86-MOVBE-NEXT: movbel %edi, 8(%eax) ; X86-MOVBE-NEXT: movbel %ecx, 4(%eax) ; X86-MOVBE-NEXT: movbel %edx, (%eax) +; X86-MOVBE-NEXT: leal -8(%ebp), %esp ; X86-MOVBE-NEXT: popl %esi ; X86-MOVBE-NEXT: popl %edi +; X86-MOVBE-NEXT: popl %ebp ; X86-MOVBE-NEXT: retl $4 ; ; X64-LABEL: bswap_i128: diff --git a/llvm/test/CodeGen/X86/constructor.ll b/llvm/test/CodeGen/X86/constructor.ll index f46325db8c19e..dca62acff6ed5 100644 --- a/llvm/test/CodeGen/X86/constructor.ll +++ b/llvm/test/CodeGen/X86/constructor.ll @@ -5,7 +5,6 @@ ; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple x86_64-unknown-freebsd < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple x86_64-pc-solaris2.11 < %s | FileCheck --check-prefix=INIT-ARRAY %s -; RUN: llc -mtriple x86_64-unknown-nacl < %s | FileCheck --check-prefix=NACL %s ; RUN: llc -mtriple i586-intel-elfiamcu -use-ctors < %s | FileCheck %s --check-prefix=MCU-CTORS ; RUN: llc -mtriple i586-intel-elfiamcu < %s | FileCheck %s --check-prefix=MCU-INIT-ARRAY ; RUN: llc -mtriple x86_64-win32-gnu < %s | FileCheck --check-prefix=COFF-CTOR %s @@ -62,18 +61,6 @@ entry: ; INIT-ARRAY-NEXT: .quad i ; INIT-ARRAY-NEXT: .quad j -; NACL: .section .init_array.15,"awG",@init_array,v,comdat -; NACL-NEXT: .p2align 2 -; NACL-NEXT: .long g -; NACL-NEXT: .section .init_array.55555,"awG",@init_array,v,comdat -; NACL-NEXT: .p2align 2 -; NACL-NEXT: .long h -; NACL-NEXT: .section .init_array,"aw",@init_array -; NACL-NEXT: .p2align 2 -; NACL-NEXT: .long f -; NACL-NEXT: .long i -; NACL-NEXT: .long j - ; MCU-CTORS: .section .ctors,"aw",@progbits ; MCU-INIT-ARRAY: .section .init_array,"aw",@init_array diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index d869f8ec01a5a..661e7bb19641c 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -152,17 +152,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $176, %esp -; X86-NEXT: movl 20(%ebp), %edx -; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %edx ; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -172,16 +172,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 48(%ebp), %ecx ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: movl 44(%ebp), %ebx ; X86-NEXT: 
xorl %edx, %ebx -; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %edi ; X86-NEXT: xorl %edx, %edi ; X86-NEXT: subl %edx, %edi ; X86-NEXT: sbbl %edx, %ebx @@ -488,13 +488,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sbbl %ecx, %ebx ; X86-NEXT: sbbl %ecx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%ebp), %ecx +; X86-NEXT: movl 56(%ebp), %ecx ; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: movl %eax, 4(%ecx) ; X86-NEXT: movl %ebx, 8(%ecx) ; X86-NEXT: movl %esi, 12(%ecx) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %ecx ; X86-NEXT: movl %ebx, %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -508,7 +508,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -523,17 +523,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 40(%ebp), %eax ; X86-NEXT: imull %eax, %ebx ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: imull %esi, %edi ; X86-NEXT: addl %edx, %edi ; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl 52(%ebp), %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: imull %edx, %ebx ; X86-NEXT: mull %edx @@ -543,13 +543,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl 12(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %edx ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %edi ; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %esi ; X86-NEXT: sbbl %ebx, %esi ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edx, (%eax) diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 7bbddefd82721..370e1c608e44f 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -152,26 +152,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $160, %esp -; X86-NEXT: movl 28(%ebp), %ebx -; X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %ebx, 
%ecx -; X86-NEXT: orl 36(%ebp), %ecx +; X86-NEXT: orl 48(%ebp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl -; X86-NEXT: movl 16(%ebp), %eax -; X86-NEXT: orl 24(%ebp), %eax -; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: orl 20(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: orl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: orl 32(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl 36(%ebp), %ecx +; X86-NEXT: bsrl 48(%ebp), %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %esi, %esi @@ -184,325 +184,310 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: addl $64, %eax -; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl 48(%ebp), %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: cmovnel %ecx, %eax -; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %ebx ; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl 32(%ebp), %ecx ; X86-NEXT: bsrl %ecx, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl 16(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %edi ; X86-NEXT: bsrl %edi, %esi ; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl 12(%ebp), %edx +; X86-NEXT: bsrl 24(%ebp), %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: addl $64, %edx -; X86-NEXT: movl 20(%ebp), %edi -; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl 32(%ebp), %esi ; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edx, %edx ; X86-NEXT: movl $0, %ebx ; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movl $127, %ecx -; X86-NEXT: cmpl %eax, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: movl $127, %edx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl $127, %eax +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: movl $0, %edx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: sete %al -; X86-NEXT: testb %cl, %cl -; X86-NEXT: movb %cl, %ah -; X86-NEXT: movl 24(%ebp), %ebx -; X86-NEXT: movl $0, %esi -; X86-NEXT: cmovnel %esi, %ebx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovnel %esi, %ecx +; X86-NEXT: sbbl %ebx, %edx ; X86-NEXT: movl $0, %edx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%ebp), %esi -; X86-NEXT: cmovnel %edx, %esi -; X86-NEXT: movl 12(%ebp), %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: orb %ah, %al -; X86-NEXT: movl 44(%ebp), %eax -; X86-NEXT: jne .LBB4_7 -; X86-NEXT: # %bb.1: # %udiv-bb1 -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: 
sbbl %ecx, %edx +; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: setb %dl +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: cmovnel %edi, %esi +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: cmovnel %edi, %edx +; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: cmovnel %edi, %ebx +; X86-NEXT: movl 56(%ebp), %edi +; X86-NEXT: jne .LBB4_8 +; X86-NEXT: # %bb.1: # %_udiv-special-cases +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: xorl $127, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 56(%ebp), %edi +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: je .LBB4_8 +; X86-NEXT: # %bb.2: # %udiv-bb1 +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: xorps %xmm0, %xmm0 ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 20(%ebp), %edx -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorb $127, %cl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 136(%esp,%eax), %edi -; X86-NEXT: movl 140(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 136(%esp,%eax), %esi +; X86-NEXT: movl 140(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 128(%esp,%eax), %ebx -; X86-NEXT: movl 132(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %edi -; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: movl 132(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shldl %cl, %ebx, %edx ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: addl $1, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 20(%ebp), %ebx -; X86-NEXT: jae .LBB4_2 -; X86-NEXT: # %bb.5: -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: jmp .LBB4_6 -; X86-NEXT: .LBB4_2: # %udiv-preheader +; X86-NEXT: jae .LBB4_3 +; X86-NEXT: # %bb.6: +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: jmp .LBB4_7 +; X86-NEXT: .LBB4_3: # %udiv-preheader ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl 16(%ebp), %edx -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%ebp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 92(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esp,%eax), %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrdl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 88(%esp,%eax), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esp,%eax), %edi -; X86-NEXT: movl 84(%esp,%eax), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shrdl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrdl %cl, %eax, %edi +; X86-NEXT: shrdl %cl, %edx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %edx +; X86-NEXT: movl 84(%esp,%eax), %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shrdl %cl, %edi, %ebx +; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%ebp), %eax ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl 44(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebp), %esi -; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebp), %eax +; 
X86-NEXT: movl 48(%ebp), %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 52(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: .p2align 4 -; X86-NEXT: .LBB4_3: # %udiv-do-while +; X86-NEXT: .LBB4_4: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edx -; X86-NEXT: orl %eax, %edx +; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl $1, %ebx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: shldl $1, %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl 
52(%ebp), %esi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl 40(%ebp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl 36(%ebp), %eax +; X86-NEXT: andl 48(%ebp), %eax ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl 32(%ebp), %edx -; X86-NEXT: andl 28(%ebp), %ecx -; X86-NEXT: subl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: andl 44(%ebp), %edx +; X86-NEXT: andl 40(%ebp), %ecx +; X86-NEXT: subl %ecx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax -; X86-NEXT: adcl $-1, %ebx -; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: jne .LBB4_3 -; X86-NEXT: # %bb.4: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB4_4 +; X86-NEXT: # %bb.5: ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: .LBB4_6: # %udiv-loop-exit ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: shldl $1, %esi, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $1, %ecx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl 44(%ebp), %eax -; X86-NEXT: .LBB4_7: # %udiv-end -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl 56(%ebp), %edi +; X86-NEXT: .LBB4_7: # %udiv-loop-exit +; X86-NEXT: shldl $1, %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: shldl $1, %ebx, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: addl %ebx, %ebx +; X86-NEXT: 
orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: .LBB4_8: # %udiv-end ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, (%edi) +; X86-NEXT: movl %edx, 4(%edi) +; X86-NEXT: movl %esi, 8(%edi) +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl 48(%ebp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: imull %edx, %esi -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl 40(%ebp), %edi -; X86-NEXT: imull %ecx, %edi +; X86-NEXT: movl 52(%ebp), %edi +; X86-NEXT: imull %ebx, %edi ; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl 28(%ebp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull 28(%ebp), %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl 32(%ebp), %edx -; X86-NEXT: imull %edx, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: imull 40(%ebp), %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl 40(%ebp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull 32(%ebp) -; X86-NEXT: movl 16(%ebp), %esi +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull 44(%ebp) +; X86-NEXT: movl 28(%ebp), %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull 32(%ebp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: mull 44(%ebp) ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 12(%ebp), 
%ebx +; X86-NEXT: movl 24(%ebp), %ebx ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %edi ; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ebx, (%eax) diff --git a/llvm/test/CodeGen/X86/fast-isel-x32.ll b/llvm/test/CodeGen/X86/fast-isel-x32.ll index 23f6304c88d28..e01cebecdbb0a 100644 --- a/llvm/test/CodeGen/X86/fast-isel-x32.ll +++ b/llvm/test/CodeGen/X86/fast-isel-x32.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel -fast-isel-abort=1 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort=1 | FileCheck %s ; Test that alloca addresses are materialized with the right size instruction. diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll index 707b05f3478db..bb5640aeb66fa 100644 --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -481,18 +481,21 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixtfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -501,7 +504,7 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -620,18 +623,21 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixunstfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -640,7 +646,7 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -818,18 +824,21 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floattitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -838,7 +847,7 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -1016,18 +1025,21 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floatuntitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1036,7 +1048,7 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll index 1de2484d47ba1..6d4ec063ccd46 100644 --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -415,16 +415,20 @@ define dso_local void @TestFPToSIF128_I128() nounwind { ; X86-LABEL: TestFPToSIF128_I128: ; X86: # 
%bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vf128, %eax +; X86-NEXT: movl vf128+4, %ecx +; X86-NEXT: movl vf128+8, %edx +; X86-NEXT: movl vf128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixtfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -432,7 +436,7 @@ define dso_local void @TestFPToSIF128_I128() nounwind { ; X86-NEXT: movl %edx, vi128+8 ; X86-NEXT: movl %ecx, vi128+4 ; X86-NEXT: movl %eax, vi128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -466,16 +470,20 @@ define dso_local void @TestFPToUIF128_U128() nounwind { ; X86-LABEL: TestFPToUIF128_U128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vf128, %eax +; X86-NEXT: movl vf128+4, %ecx +; X86-NEXT: movl vf128+8, %edx +; X86-NEXT: movl vf128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixunstfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -483,7 +491,7 @@ define dso_local void @TestFPToUIF128_U128() nounwind { ; X86-NEXT: movl %edx, vu128+8 ; X86-NEXT: movl %ecx, vu128+4 ; X86-NEXT: movl %eax, vu128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -913,16 +921,20 @@ define dso_local void @TestSIToFPI128_F128() nounwind { ; X86-LABEL: TestSIToFPI128_F128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vi128, %eax +; X86-NEXT: movl vi128+4, %ecx +; X86-NEXT: movl vi128+8, %edx +; X86-NEXT: movl vi128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vi128+12 -; X86-NEXT: pushl vi128+8 -; X86-NEXT: pushl vi128+4 -; X86-NEXT: pushl vi128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floattitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -930,7 +942,7 @@ define dso_local void @TestSIToFPI128_F128() nounwind { ; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 ; X86-NEXT: movl %eax, vf128 -; X86-NEXT: addl $24, 
%esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -964,16 +976,20 @@ define dso_local void @TestUIToFPU128_F128() #2 { ; X86-LABEL: TestUIToFPU128_F128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vu128, %eax +; X86-NEXT: movl vu128+4, %ecx +; X86-NEXT: movl vu128+8, %edx +; X86-NEXT: movl vu128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vu128+12 -; X86-NEXT: pushl vu128+8 -; X86-NEXT: pushl vu128+4 -; X86-NEXT: pushl vu128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floatuntitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -981,7 +997,7 @@ define dso_local void @TestUIToFPU128_F128() #2 { ; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 ; X86-NEXT: movl %eax, vf128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -1134,33 +1150,30 @@ define dso_local i32 @TestBits128(fp128 %ld) nounwind { ; ; X86-LABEL: TestBits128: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl $12, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: orl (%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sete %al -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-AVX-LABEL: TestBits128: @@ -1359,12 +1372,14 @@ define i1 @PR34866(i128 %x) nounwind { ; ; X86-LABEL: PR34866: ; X86: # %bb.0: +; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-AVX-LABEL: PR34866: @@ -1394,12 +1409,14 @@ define i1 @PR34866_commute(i128 %x) nounwind { ; ; X86-LABEL: PR34866_commute: ; X86: # %bb.0: +; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: 
orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-AVX-LABEL: PR34866_commute: diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index a7eea04181f60..ad2d690fd7ed0 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -41,27 +41,40 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: add: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: add: @@ -81,24 +94,32 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), 
%eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -107,9 +128,10 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -141,27 +163,40 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: sub: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: sub: @@ -181,24 +216,32 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, 
{{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -207,9 +250,10 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -241,27 +285,40 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: mul: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: mul: @@ -281,24 +338,32 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; 
WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -307,9 +372,10 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -341,27 +407,40 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: div: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: div: @@ -381,24 +460,32 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 
8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -407,9 +494,10 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -434,31 +522,48 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; ; X86-LABEL: fma: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $92, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: 
pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmaf128 -; X86-NEXT: addl $60, %esp -; X86-NEXT: movaps (%esp), %xmm0 -; X86-NEXT: movaps %xmm0, (%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%ebp) +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: addl $92, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: fma: @@ -481,28 +586,40 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $96, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 56(%ebp) -; WIN-X86-NEXT: pushl 52(%ebp) -; WIN-X86-NEXT: pushl 48(%ebp) -; WIN-X86-NEXT: pushl 44(%ebp) -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 52(%ebp), %ebx +; WIN-X86-NEXT: movl 56(%ebp), %edi +; WIN-X86-NEXT: movl 60(%ebp), %edx +; WIN-X86-NEXT: movl 64(%ebp), %ecx +; WIN-X86-NEXT: movl 68(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 48(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 44(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 40(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmal -; WIN-X86-NEXT: addl $52, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -511,9 +628,10 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -538,27 +656,40 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: frem: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx 
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: frem: @@ -578,24 +709,32 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -604,9 +743,10 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -631,23 +771,28 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; ; X86-LABEL: ceil: ; X86: # %bb.0: # %entry +; X86-NEXT: 
pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll ceilf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: ceil: @@ -667,17 +812,20 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _ceill -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -713,23 +861,28 @@ define fp128 @acos(fp128 %x) nounwind strictfp { ; ; X86-LABEL: acos: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll acosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: acos: @@ -749,17 +902,20 @@ define fp128 @acos(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl 
%esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _acosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -795,23 +951,28 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; ; X86-LABEL: cos: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll cosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: cos: @@ -831,17 +992,20 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _cosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -877,23 +1041,28 @@ define fp128 @cosh(fp128 %x) nounwind strictfp { ; ; X86-LABEL: cosh: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl 
$52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll coshf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: cosh: @@ -913,17 +1082,20 @@ define fp128 @cosh(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _coshl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -959,23 +1131,28 @@ define fp128 @exp(fp128 %x) nounwind strictfp { ; ; X86-LABEL: exp: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll expf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: exp: @@ -995,17 +1172,20 @@ define fp128 @exp(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; 
WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _expl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1041,23 +1221,28 @@ define fp128 @exp2(fp128 %x) nounwind strictfp { ; ; X86-LABEL: exp2: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll exp2f128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: exp2: @@ -1077,17 +1262,20 @@ define fp128 @exp2(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _exp2l -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1123,23 +1311,28 @@ define fp128 @floor(fp128 %x) nounwind strictfp { ; ; X86-LABEL: floor: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; 
X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll floorf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: floor: @@ -1159,17 +1352,20 @@ define fp128 @floor(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _floorl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1205,23 +1401,28 @@ define fp128 @log(fp128 %x) nounwind strictfp { ; ; X86-LABEL: log: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll logf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: log: @@ -1241,17 +1442,20 @@ define fp128 @log(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), 
%esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _logl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1287,23 +1491,28 @@ define fp128 @log10(fp128 %x) nounwind strictfp { ; ; X86-LABEL: log10: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll log10f128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: log10: @@ -1323,17 +1532,20 @@ define fp128 @log10(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _log10l -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1369,23 +1581,28 @@ define fp128 @log2(fp128 %x) nounwind strictfp { ; ; X86-LABEL: log2: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll log2f128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: log2: @@ -1405,17 +1622,20 @@ define fp128 @log2(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _log2l -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1451,27 +1671,40 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: maxnum: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmaxf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps 
{{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: maxnum: @@ -1491,24 +1724,32 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmaxl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1517,9 +1758,10 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1544,27 +1786,40 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: minnum: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fminf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: minnum: @@ -1584,24 +1839,32 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fminl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1610,9 +1873,10 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1637,23 +1901,28 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp { ; ; X86-LABEL: nearbyint: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll nearbyintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: nearbyint: @@ -1673,17 +1942,20 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _nearbyintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1719,27 +1991,40 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: pow: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll powf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: pow: @@ -1759,24 +2044,32 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; 
WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _powl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1785,9 +2078,10 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1819,24 +2113,32 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp { ; ; X86-LABEL: powi: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __powitf2 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 ; ; WIN-LABEL: powi: @@ -1853,21 +2155,26 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; 
WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___powitf2 -; WIN-X86-NEXT: addl $24, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1876,9 +2183,10 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1903,23 +2211,28 @@ define fp128 @rint(fp128 %x) nounwind strictfp { ; ; X86-LABEL: rint: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll rintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: rint: @@ -1939,17 +2252,20 @@ define fp128 @rint(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; 
WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _rintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1985,23 +2301,28 @@ define fp128 @round(fp128 %x) nounwind strictfp { ; ; X86-LABEL: round: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll roundf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: round: @@ -2021,17 +2342,20 @@ define fp128 @round(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _roundl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2067,23 +2391,28 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp { ; ; X86-LABEL: roundeven: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll 
roundevenf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: roundeven: @@ -2103,17 +2432,20 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _roundevenl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2149,23 +2481,28 @@ define fp128 @asin(fp128 %x) nounwind strictfp { ; ; X86-LABEL: asin: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll asinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: asin: @@ -2185,17 +2522,20 @@ define fp128 @asin(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl 
%eax, (%esp) ; WIN-X86-NEXT: calll _asinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2231,23 +2571,28 @@ define fp128 @sin(fp128 %x) nounwind strictfp { ; ; X86-LABEL: sin: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: sin: @@ -2267,17 +2612,20 @@ define fp128 @sin(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2313,23 +2661,28 @@ define fp128 @sinh(fp128 %x) nounwind strictfp { ; ; X86-LABEL: sinh: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: 
movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: sinh: @@ -2349,17 +2702,20 @@ define fp128 @sinh(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2395,23 +2751,28 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp { ; ; X86-LABEL: sqrt: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sqrtf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: sqrt: @@ -2431,17 +2792,20 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sqrtl -; WIN-X86-NEXT: addl $20, 
%esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2477,23 +2841,28 @@ define fp128 @atan(fp128 %x) nounwind strictfp { ; ; X86-LABEL: atan: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: atan: @@ -2513,17 +2882,20 @@ define fp128 @atan(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2559,27 +2931,40 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: atan2: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, 
{{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atan2f128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: atan2: @@ -2599,24 +2984,32 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atan2l -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2625,9 +3018,10 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -2652,23 +3046,28 @@ define fp128 @tan(fp128 %x) nounwind strictfp { ; ; X86-LABEL: tan: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: tan: @@ -2688,17 +3087,20 @@ define fp128 @tan(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2734,23 +3136,28 @@ define fp128 @tanh(fp128 %x) nounwind strictfp { ; ; X86-LABEL: tanh: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: tanh: @@ -2770,17 +3177,20 @@ define fp128 @tanh(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, 
{{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2816,23 +3226,28 @@ define fp128 @trunc(fp128 %x) nounwind strictfp { ; ; X86-LABEL: trunc: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll truncf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: trunc: @@ -2852,17 +3267,20 @@ define fp128 @trunc(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _truncl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2919,12 +3337,18 @@ define i32 @lrint(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: lrint: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _lrintl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %rint 
= call i32 @llvm.experimental.constrained.lrint.i32.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -2969,12 +3393,18 @@ define i64 @llrint(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: llrint: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _llrintl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %rint = call i64 @llvm.experimental.constrained.llrint.i64.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -3019,12 +3449,18 @@ define i32 @lround(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: lround: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _lroundl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %round = call i32 @llvm.experimental.constrained.lround.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0 @@ -3069,12 +3505,18 @@ define i64 @llround(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: llround: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _llroundl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %round = call i64 @llvm.experimental.constrained.llround.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0 @@ -3176,26 +3618,32 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; ; WIN-X86-LABEL: cmp: ; WIN-X86: # %bb.0: -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) +; WIN-X86-NEXT: pushl 36(%ebp) +; WIN-X86-NEXT: pushl 32(%ebp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: je LBB37_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal 
{{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB37_3 ; WIN-X86-NEXT: LBB37_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB37_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %cond = call i1 @llvm.experimental.constrained.fcmp.f128( fp128 %x, fp128 %y, @@ -3300,26 +3748,32 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; ; WIN-X86-LABEL: cmps: ; WIN-X86: # %bb.0: -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) +; WIN-X86-NEXT: pushl 36(%ebp) +; WIN-X86-NEXT: pushl 32(%ebp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: je LBB38_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB38_3 ; WIN-X86-NEXT: LBB38_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB38_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %cond = call i1 @llvm.experimental.constrained.fcmps.f128( fp128 %x, fp128 %y, @@ -3496,44 +3950,47 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; WIN-X86-LABEL: cmp_ueq_q: ; WIN-X86: # %bb.0: ; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: movl 32(%ebp), %edi +; WIN-X86-NEXT: movl 36(%ebp), %esi +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: sete %bl -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___unordtf2 ; 
WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: orb %bl, %al ; WIN-X86-NEXT: jne LBB39_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB39_3 ; WIN-X86-NEXT: LBB39_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB39_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi ; WIN-X86-NEXT: popl %ebx @@ -3716,32 +4173,34 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; WIN-X86-LABEL: cmp_one_q: ; WIN-X86: # %bb.0: ; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: movl 32(%ebp), %edi +; WIN-X86-NEXT: movl 36(%ebp), %esi +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: setne %bl -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___unordtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax @@ -3749,13 +4208,14 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; WIN-X86-NEXT: testb %bl, %al ; WIN-X86-NEXT: jne LBB40_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB40_3 ; WIN-X86-NEXT: LBB40_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB40_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi ; WIN-X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll index f727a79078627..4b0449fd7502e 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll @@ -42,22 +42,38 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Add: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Add: @@ -78,22 +94,31 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -101,8 +126,10 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -144,22 +171,38 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Add: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl 
%edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Add: @@ -180,22 +223,31 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -203,8 +255,10 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -241,22 +295,38 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) 
nounwind { ; ; X86-LABEL: Test128Sub: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sub: @@ -277,22 +347,31 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -300,8 +379,10 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; 
WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -343,22 +424,38 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Sub: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Sub: @@ -379,22 +476,31 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -402,8 +508,10 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind 
{ ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -440,22 +548,38 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Mul: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Mul: @@ -476,22 +600,31 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; 
WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -499,8 +632,10 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -542,22 +677,38 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Mul: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Mul: @@ -578,22 +729,31 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal 
{{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -601,8 +761,10 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -639,22 +801,38 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Div: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Div: @@ -675,22 +853,31 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, 
{{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -698,8 +885,10 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -741,22 +930,38 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Div: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Div: @@ -777,22 +982,31 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; 
WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -800,8 +1014,10 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -830,22 +1046,38 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Rem: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Rem: @@ -866,22 +1098,31 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 
16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -889,8 +1130,10 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -922,22 +1165,38 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Rem: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Rem: @@ -958,22 +1217,31 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: 
pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -981,8 +1249,10 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1011,18 +1281,24 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Sqrt: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sqrtf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sqrt: @@ -1042,16 +1318,19 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sqrtl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl 
(%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1089,18 +1368,24 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Sin: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sin: @@ -1120,16 +1405,19 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1167,18 +1455,24 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Cos: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll cosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Cos: @@ -1198,16 +1492,19 @@ define dso_local void @Test128Cos(fp128 %d1) 
nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _cosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1245,18 +1542,24 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Ceil: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll ceilf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Ceil: @@ -1276,16 +1579,19 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _ceill -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1323,18 +1629,24 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Floor: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll floorf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Floor: @@ -1354,16 +1666,19 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _floorl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1401,18 +1716,24 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Trunc: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll truncf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Trunc: @@ -1432,16 +1753,19 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 
20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _truncl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1479,18 +1803,24 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Nearbyint: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll nearbyintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Nearbyint: @@ -1510,16 +1840,19 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _nearbyintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1557,18 +1890,24 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Rint: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, 
(%esp) ; X86-NEXT: calll rintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Rint: @@ -1588,16 +1927,19 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _rintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1635,18 +1977,24 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Round: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll roundf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Round: @@ -1666,16 +2014,19 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _roundl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; 
WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1705,31 +2056,48 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; ; X86-LABEL: Test128FMA: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $92, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmaf128 -; X86-NEXT: addl $60, %esp -; X86-NEXT: movaps (%esp), %xmm0 -; X86-NEXT: movaps %xmm0, (%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%ebp) +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: addl $92, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128FMA: @@ -1752,28 +2120,40 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $96, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 56(%ebp) -; WIN-X86-NEXT: pushl 52(%ebp) -; WIN-X86-NEXT: pushl 48(%ebp) -; WIN-X86-NEXT: pushl 44(%ebp) -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 52(%ebp), %ebx +; WIN-X86-NEXT: movl 56(%ebp), %edi +; WIN-X86-NEXT: movl 60(%ebp), %edx +; WIN-X86-NEXT: movl 64(%ebp), %ecx +; WIN-X86-NEXT: movl 68(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 48(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 44(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 40(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmal -; WIN-X86-NEXT: addl $52, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1782,9 +2162,10 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1804,23 +2185,28 @@ define fp128 @Test128Acos(fp128 %a) nounwind { ; ; X86-LABEL: Test128Acos: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll acosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Acos: @@ -1840,17 +2226,20 @@ define fp128 @Test128Acos(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll 
_acosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1879,23 +2268,28 @@ define fp128 @Test128Asin(fp128 %a) nounwind { ; ; X86-LABEL: Test128Asin: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll asinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Asin: @@ -1915,17 +2309,20 @@ define fp128 @Test128Asin(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _asinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1954,23 +2351,28 @@ define fp128 @Test128Atan(fp128 %a) nounwind { ; ; X86-LABEL: Test128Atan: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: 
subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Atan: @@ -1990,17 +2392,20 @@ define fp128 @Test128Atan(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2029,27 +2434,40 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; ; X86-LABEL: Test128Atan2: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atan2f128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Atan2: @@ -2069,24 +2487,32 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; WIN-X86: # %bb.0: ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: 
movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atan2l -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2095,9 +2521,10 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b) @@ -2115,23 +2542,28 @@ define fp128 @Test128Cosh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Cosh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll coshf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Cosh: @@ -2151,17 +2583,20 @@ define fp128 @Test128Cosh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 
32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _coshl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2190,23 +2625,28 @@ define fp128 @Test128Sinh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Sinh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Sinh: @@ -2226,17 +2666,20 @@ define fp128 @Test128Sinh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2265,23 +2708,28 @@ define fp128 @Test128Tan(fp128 %a) nounwind { ; ; X86-LABEL: Test128Tan: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; 
X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Tan: @@ -2301,17 +2749,20 @@ define fp128 @Test128Tan(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2340,23 +2791,28 @@ define fp128 @Test128Tanh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Tanh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Tanh: @@ -2376,17 +2832,20 @@ define fp128 @Test128Tanh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; 
WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2425,27 +2884,34 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind { ; ; X86-LABEL: Test128Modf: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $80, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ecx +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll modff128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: movaps %xmm1, 16(%esi) ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $40, %esp +; X86-NEXT: addl $80, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Modf: @@ -2468,18 +2934,21 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: subl $112, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ebx +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx -; WIN-X86-NEXT: pushl %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %ecx +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _modfl -; WIN-X86-NEXT: addl $24, %esp ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/frameaddr.ll b/llvm/test/CodeGen/X86/frameaddr.ll index 7f2f9b8bd6fd0..33c9e3855c108 100644 --- a/llvm/test/CodeGen/X86/frameaddr.ll +++ b/llvm/test/CodeGen/X86/frameaddr.ll @@ -5,8 +5,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -fast-isel -fast-isel-abort=1 | FileCheck %s 
--check-prefix=CHECK-64 ; RUN: llc < %s -mtriple=x86_64-gnux32 | FileCheck %s --check-prefix=CHECK-X32ABI ; RUN: llc < %s -mtriple=x86_64-gnux32 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK-X32ABI -; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s --check-prefix=CHECK-NACL64 -; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK-NACL64 define ptr @test1() nounwind { entry: @@ -34,10 +32,6 @@ entry: ; CHECK-X32ABI-NEXT: movl %ebp, %eax ; CHECK-X32ABI-NEXT: popq %rbp ; CHECK-X32ABI-NEXT: ret -; CHECK-NACL64-LABEL: test1 -; CHECK-NACL64: pushq %rbp -; CHECK-NACL64-NEXT: movq %rsp, %rbp -; CHECK-NACL64-NEXT: movl %ebp, %eax %0 = tail call ptr @llvm.frameaddress(i32 0) ret ptr %0 } @@ -71,11 +65,6 @@ entry: ; CHECK-X32ABI-NEXT: movl (%eax), %eax ; CHECK-X32ABI-NEXT: popq %rbp ; CHECK-X32ABI-NEXT: ret -; CHECK-NACL64-LABEL: test2 -; CHECK-NACL64: pushq %rbp -; CHECK-NACL64-NEXT: movq %rsp, %rbp -; CHECK-NACL64-NEXT: movl (%ebp), %eax -; CHECK-NACL64-NEXT: movl (%eax), %eax %0 = tail call ptr @llvm.frameaddress(i32 2) ret ptr %0 } diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll index 8602c385af834..bc9e29957c74a 100644 --- a/llvm/test/CodeGen/X86/freeze-unary.ll +++ b/llvm/test/CodeGen/X86/freeze-unary.ll @@ -70,6 +70,86 @@ define <2 x i64> @freeze_zext_vec(<2 x i16> %a0) nounwind { ret <2 x i64> %z } +define i32 @freeze_abs(i32 %a0) nounwind { +; X86-LABEL: freeze_abs: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: freeze_abs: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovsl %edi, %eax +; X64-NEXT: retq + %x = call i32 @llvm.abs.i32(i32 %a0, i1 0) + %f = freeze i32 %x + %r = call i32 @llvm.abs.i32(i32 %f, i1 0) + ret i32 %r +} + +define <4 x i32> @freeze_abs_vec(<4 x i32> %a0) nounwind { +; X86-LABEL: freeze_abs_vec: +; X86: # %bb.0: +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrad $31, %xmm1 +; X86-NEXT: pxor %xmm1, %xmm0 +; X86-NEXT: psubd %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: freeze_abs_vec: +; X64: # %bb.0: +; X64-NEXT: pabsd %xmm0, %xmm0 +; X64-NEXT: retq + %x = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a0, i1 0) + %f = freeze <4 x i32> %x + %r = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %f, i1 0) + ret <4 x i32> %r +} + +define i32 @freeze_abs_undef(i32 %a0) nounwind { +; X86-LABEL: freeze_abs_undef: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: freeze_abs_undef: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovsl %edi, %eax +; X64-NEXT: retq + %x = call i32 @llvm.abs.i32(i32 %a0, i1 -1) + %f = freeze i32 %x + %r = call i32 @llvm.abs.i32(i32 %f, i1 -1) + ret i32 %r +} + +define <4 x i32> @freeze_abs_undef_vec(<4 x i32> %a0) nounwind { +; X86-LABEL: freeze_abs_undef_vec: +; X86: # %bb.0: +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrad $31, %xmm1 +; X86-NEXT: pxor %xmm1, %xmm0 +; X86-NEXT: psubd %xmm1, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: freeze_abs_undef_vec: +; X64: # %bb.0: +; X64-NEXT: pabsd %xmm0, %xmm0 +; X64-NEXT: retq + %x = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a0, i1 -1) + %f = freeze <4 x i32> %x + %r = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %f, i1 -1) + ret <4 x i32> %r +} + define i32 @freeze_bswap(i32 %a0) 
nounwind { ; X86-LABEL: freeze_bswap: ; X86: # %bb.0: diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index e8c8ccfa8d37f..ec1b8a3c8d6d9 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -264,53 +264,62 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-LABEL: var_shift_i128: ; X86-FAST: # %bb.0: ; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: movl %esp, %ebp ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: andl $-16, %esp +; X86-FAST-NEXT: subl $16, %esp +; X86-FAST-NEXT: movl 24(%ebp), %edi +; X86-FAST-NEXT: movl 28(%ebp), %edx +; X86-FAST-NEXT: movl 48(%ebp), %esi +; X86-FAST-NEXT: movl 56(%ebp), %ecx ; X86-FAST-NEXT: testb $64, %cl +; X86-FAST-NEXT: movl 52(%ebp), %eax ; X86-FAST-NEXT: jne .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %ebx, %ebp ; X86-FAST-NEXT: movl %esi, %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl %edi, %eax -; X86-FAST-NEXT: movl %edx, %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: movl 32(%ebp), %edi +; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl %edx, %eax +; X86-FAST-NEXT: movl 36(%ebp), %edx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %esi, %edx -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: movl %ebx, %edi -; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %eax +; X86-FAST-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-FAST-NEXT: jmp .LBB6_6 ; X86-FAST-NEXT: .LBB6_1: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl 44(%ebp), %ebx +; X86-FAST-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl 40(%ebp), %ebx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_4 ; X86-FAST-NEXT: .LBB6_5: -; X86-FAST-NEXT: movl %eax, %ebp +; X86-FAST-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-FAST-NEXT: .LBB6_6: -; X86-FAST-NEXT: movl %ebx, %eax -; X86-FAST-NEXT: shldl %cl, %ebp, %eax -; X86-FAST-NEXT: movl %edi, %ebp -; X86-FAST-NEXT: shldl %cl, %ebx, %ebp -; X86-FAST-NEXT: movl %esi, %ebx -; X86-FAST-NEXT: shldl %cl, %edi, %ebx +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: shldl %cl, %ebx, %edi +; X86-FAST-NEXT: movl %eax, %edx +; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: shldl %cl, %esi, %ebx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: movl %eax, %esi +; X86-FAST-NEXT: shldl %cl, %edx, %esi ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shldl %cl, %esi, %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: movl %edx, 12(%ecx) -; X86-FAST-NEXT: movl %ebx, 8(%ecx) -; X86-FAST-NEXT: movl %ebp, 4(%ecx) -; X86-FAST-NEXT: movl %eax, (%ecx) -; X86-FAST-NEXT: movl %ecx, %eax +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: shldl 
%cl, %eax, %edx +; X86-FAST-NEXT: movl 8(%ebp), %eax +; X86-FAST-NEXT: movl %edx, 12(%eax) +; X86-FAST-NEXT: movl %esi, 8(%eax) +; X86-FAST-NEXT: movl %ebx, 4(%eax) +; X86-FAST-NEXT: movl %edi, (%eax) +; X86-FAST-NEXT: leal -12(%ebp), %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi ; X86-FAST-NEXT: popl %ebx @@ -320,77 +329,91 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-LABEL: var_shift_i128: ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: movl %esp, %ebp ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: pushl %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: testb $64, %al +; X86-SLOW-NEXT: andl $-16, %esp +; X86-SLOW-NEXT: subl $32, %esp +; X86-SLOW-NEXT: movl 24(%ebp), %esi +; X86-SLOW-NEXT: movl 28(%ebp), %eax +; X86-SLOW-NEXT: movl 48(%ebp), %edx +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: testb $64, %cl +; X86-SLOW-NEXT: movl 52(%ebp), %edi ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %ebx, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, %edx +; X86-SLOW-NEXT: movl 32(%ebp), %esi ; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: testb $32, %al -; X86-SLOW-NEXT: je .LBB6_5 -; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %edi, %ebx -; X86-SLOW-NEXT: movl %edx, %edi -; X86-SLOW-NEXT: movl %ecx, %edx -; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: movl %eax, %edi +; X86-SLOW-NEXT: movl 36(%ebp), %eax +; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: testb $32, %al +; X86-SLOW-NEXT: movl 40(%ebp), %ecx +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl 44(%ebp), %ecx +; X86-SLOW-NEXT: .LBB6_3: +; X86-SLOW-NEXT: movl 56(%ebp), %ebx +; X86-SLOW-NEXT: testb $32, %bl ; X86-SLOW-NEXT: jne .LBB6_4 -; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ecx, %ebp -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: ; X86-SLOW-NEXT: movl %edx, %esi -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: shrl %ebp -; X86-SLOW-NEXT: movb %al, %ch -; X86-SLOW-NEXT: notb %ch -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: orl %esi, %ebp -; X86-SLOW-NEXT: movl %edi, %esi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: 
shll %cl, %esi -; X86-SLOW-NEXT: shrl %edx -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movb %al, %cl +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl %ebx, %edi ; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: notb %bl +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: movl %esi, %eax +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %edx +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: orl %eax, %edx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: movl %ebx, %eax +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %ebx -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-SLOW-NEXT: shrl %cl, %ebx ; X86-SLOW-NEXT: orl %eax, %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl 8(%ebp), %eax ; X86-SLOW-NEXT: movl %ebx, 12(%eax) -; X86-SLOW-NEXT: movl %edi, 8(%eax) +; X86-SLOW-NEXT: movl %esi, 8(%eax) ; X86-SLOW-NEXT: movl %edx, 4(%eax) -; X86-SLOW-NEXT: movl %ebp, (%eax) -; X86-SLOW-NEXT: addl $4, %esp +; X86-SLOW-NEXT: movl %edi, (%eax) +; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 4340f8fd484ae..544ab7fc77374 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -258,51 +258,53 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-LABEL: var_shift_i128: ; X86-FAST: # %bb.0: ; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: movl %esp, %ebp ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: pushl %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: andl $-16, %esp +; X86-FAST-NEXT: subl $16, %esp +; X86-FAST-NEXT: movl 24(%ebp), %esi +; X86-FAST-NEXT: movl 28(%ebp), %eax +; X86-FAST-NEXT: movl 48(%ebp), %edx +; X86-FAST-NEXT: movl 56(%ebp), %ecx ; X86-FAST-NEXT: testb $64, %cl +; X86-FAST-NEXT: movl 52(%ebp), %ebx ; X86-FAST-NEXT: je .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl %edi, %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), 
%edi -; X86-FAST-NEXT: movl %esi, %ebp -; X86-FAST-NEXT: movl %ebx, %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: movl 32(%ebp), %esi +; X86-FAST-NEXT: movl %ebx, %edi +; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: movl 36(%ebp), %eax ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_4 ; X86-FAST-NEXT: jmp .LBB6_5 ; X86-FAST-NEXT: .LBB6_1: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movl 40(%ebp), %edi +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl 44(%ebp), %edi ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %edi, %ebx -; X86-FAST-NEXT: movl %esi, %edi -; X86-FAST-NEXT: movl %edx, %esi -; X86-FAST-NEXT: movl %ebp, %edx -; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-FAST-NEXT: movl %esi, %eax +; X86-FAST-NEXT: movl %ebx, %esi +; X86-FAST-NEXT: movl %edx, %ebx +; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-FAST-NEXT: .LBB6_5: -; X86-FAST-NEXT: shrdl %cl, %edx, %ebp -; X86-FAST-NEXT: shrdl %cl, %esi, %edx -; X86-FAST-NEXT: shrdl %cl, %edi, %esi +; X86-FAST-NEXT: shrdl %cl, %edx, %edi +; X86-FAST-NEXT: shrdl %cl, %ebx, %edx +; X86-FAST-NEXT: shrdl %cl, %esi, %ebx ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shrdl %cl, %ebx, %edi -; X86-FAST-NEXT: movl %edi, 12(%eax) -; X86-FAST-NEXT: movl %esi, 8(%eax) +; X86-FAST-NEXT: shrdl %cl, %eax, %esi +; X86-FAST-NEXT: movl 8(%ebp), %eax +; X86-FAST-NEXT: movl %esi, 12(%eax) +; X86-FAST-NEXT: movl %ebx, 8(%eax) ; X86-FAST-NEXT: movl %edx, 4(%eax) -; X86-FAST-NEXT: movl %ebp, (%eax) -; X86-FAST-NEXT: addl $4, %esp +; X86-FAST-NEXT: movl %edi, (%eax) +; X86-FAST-NEXT: leal -12(%ebp), %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi ; X86-FAST-NEXT: popl %ebx @@ -312,78 +314,88 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-LABEL: var_shift_i128: ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: movl %esp, %ebp ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: subl $8, %esp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: testb $64, %cl +; X86-SLOW-NEXT: andl $-16, %esp +; X86-SLOW-NEXT: subl $16, %esp +; X86-SLOW-NEXT: movl 24(%ebp), %edx +; X86-SLOW-NEXT: movl 28(%ebp), %esi +; X86-SLOW-NEXT: movl 48(%ebp), %ebx +; X86-SLOW-NEXT: movl 56(%ebp), %eax +; X86-SLOW-NEXT: testb $64, %al +; X86-SLOW-NEXT: movl 52(%ebp), %edi ; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movl %ebx, %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl %edi, %edx +; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl 32(%ebp), %edx +; X86-SLOW-NEXT: movl %edi, %eax ; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: testb $32, %cl -; X86-SLOW-NEXT: jne .LBB6_5 -; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: 
movl %edi, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebp, %edi -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: movl 36(%ebp), %esi +; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movl 40(%ebp), %eax +; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl 44(%ebp), %eax +; X86-SLOW-NEXT: .LBB6_3: +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: je .LBB6_4 -; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %ebx +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: notb %bl -; X86-SLOW-NEXT: leal (%ebp,%ebp), %eax -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: orl %edx, %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: movl %eax, %edx +; X86-SLOW-NEXT: movl %ecx, %eax +; X86-SLOW-NEXT: notb %al +; X86-SLOW-NEXT: movl %ebx, %edi +; X86-SLOW-NEXT: addl %ebx, %ebx +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: orl %edx, %ebx +; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: leal (%edi,%edi), %edx -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: leal (%ebx,%ebx), %edx +; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: orl %ebp, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: orl %edi, %edx +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-SLOW-NEXT: leal (%edi,%edi), %ebp -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: leal (%edi,%edi), %ebx +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: addl %esi, %esi -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl 8(%ebp), %ecx ; 
X86-SLOW-NEXT: movl %esi, 12(%ecx) -; X86-SLOW-NEXT: movl %ebp, 8(%ecx) +; X86-SLOW-NEXT: movl %ebx, 8(%ecx) ; X86-SLOW-NEXT: movl %edx, 4(%ecx) +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: movl %eax, (%ecx) ; X86-SLOW-NEXT: movl %ecx, %eax -; X86-SLOW-NEXT: addl $8, %esp +; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index a464d78f9af38..df97f49440f74 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -74,43 +74,57 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SSE2-LABEL: fshl_i128: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 48(%ebp), %edi +; X86-SSE2-NEXT: movl 52(%ebp), %eax +; X86-SSE2-NEXT: movl 24(%ebp), %edx +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: testb $64, %cl -; X86-SSE2-NEXT: movl %esi, %eax -; X86-SSE2-NEXT: cmovnel %ebx, %eax -; X86-SSE2-NEXT: movl %edx, %ebp -; X86-SSE2-NEXT: cmovnel %edi, %ebp -; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %ebx -; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: movl %edx, %ecx +; X86-SSE2-NEXT: cmovnel %edi, %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 28(%ebp), %esi +; X86-SSE2-NEXT: movl %esi, %ebx +; X86-SSE2-NEXT: cmovnel %eax, %ebx +; X86-SSE2-NEXT: cmovnel 44(%ebp), %eax +; X86-SSE2-NEXT: cmovnel 40(%ebp), %edi +; X86-SSE2-NEXT: cmovel 36(%ebp), %esi +; X86-SSE2-NEXT: cmovel 32(%ebp), %edx +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: testb $32, %cl -; X86-SSE2-NEXT: cmovnel %esi, %edx -; X86-SSE2-NEXT: cmovnel %ebp, %esi -; X86-SSE2-NEXT: cmovnel %eax, %ebp -; X86-SSE2-NEXT: cmovel %edi, %ebx +; X86-SSE2-NEXT: cmovnel %edx, %esi +; X86-SSE2-NEXT: cmovnel %ebx, %edx +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: cmovnel %ecx, %ebx ; X86-SSE2-NEXT: cmovel %eax, %edi -; X86-SSE2-NEXT: movl %edi, %eax -; X86-SSE2-NEXT: shldl %cl, %ebx, %eax -; X86-SSE2-NEXT: movl %ebp, %ebx -; X86-SSE2-NEXT: shldl %cl, %edi, %ebx -; X86-SSE2-NEXT: movl %esi, %edi -; X86-SSE2-NEXT: shldl %cl, %ebp, %edi +; X86-SSE2-NEXT: cmovel %ecx, %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SSE2-NEXT: shldl %cl, %esi, %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl %edx, 12(%ecx) -; X86-SSE2-NEXT: movl %edi, 8(%ecx) -; X86-SSE2-NEXT: movl %ebx, 4(%ecx) -; X86-SSE2-NEXT: movl %eax, (%ecx) -; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SSE2-NEXT: movl %ebx, %edi +; X86-SSE2-NEXT: movl 56(%ebp), %ecx +; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE2-NEXT: shldl %cl, %eax, %edi +; X86-SSE2-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl %edx, %edi +; X86-SSE2-NEXT: movl 56(%ebp), %ecx +; X86-SSE2-NEXT: shldl %cl, %ebx, %edi +; X86-SSE2-NEXT: movl 8(%ebp), %eax +; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE2-NEXT: shldl %cl, %edx, %esi +; X86-SSE2-NEXT: movl %esi, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: leal -12(%ebp), %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll index 2849e448a0534..b4546c1e983c4 100644 --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -5,17 +5,20 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind { ; X86-LABEL: add_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: addl 40(%ebp), %esi +; X86-NEXT: adcl 44(%ebp), %edi +; X86-NEXT: adcl 48(%ebp), %ecx +; X86-NEXT: adcl 52(%ebp), %edx ; X86-NEXT: addl $1, %esi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ecx @@ -24,8 +27,10 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind { ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: add_i128: diff --git a/llvm/test/CodeGen/X86/i128-fp128-abi.ll b/llvm/test/CodeGen/X86/i128-fp128-abi.ll index be8f7923b8f98..2174d5056e6ce 100644 --- a/llvm/test/CodeGen/X86/i128-fp128-abi.ll +++ b/llvm/test/CodeGen/X86/i128-fp128-abi.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp ; Combined ABI tests for fp128 and i128 @@ -15,6 +15,167 @@ ; RUN: sed 's/PrimTy/fp128/g' %s | sed 's/Prim0/0xL0/g' | llc -mtriple=i686-pc-windows-msvc -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MSVC32 ; RUN: sed 's/PrimTy/i128/g' %s | sed 's/Prim0/0/g' | llc -mtriple=i686-pc-windows-msvc -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-MSVC32 +define void @store(PrimTy %x, ptr %p) nounwind { +; CHECK-X64-F128-LABEL: store: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: movaps %xmm0, (%rdi) +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: store: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: movq %rsi, 8(%rdx) +; CHECK-X64-I128-NEXT: movq %rdi, (%rdx) +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: store: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, (%rdx) +; CHECK-MSVC64-F128-NEXT: retq +; +; 
CHECK-MSVC64-I128-LABEL: store: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: movq %rdx, 8(%r8) +; CHECK-MSVC64-I128-NEXT: movq %rcx, (%r8) +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: store: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 +; CHECK-MINGW-F128-NEXT: movaps %xmm0, (%rdx) +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: store: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: movq %rdx, 8(%r8) +; CHECK-MINGW-I128-NEXT: movq %rcx, (%r8) +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: store: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: pushl %edi +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 20(%esp), %ecx +; CHECK-X86-NEXT: movl 24(%esp), %edx +; CHECK-X86-NEXT: movl 28(%esp), %esi +; CHECK-X86-NEXT: movl 32(%esp), %edi +; CHECK-X86-NEXT: movl %esi, 12(%edi) +; CHECK-X86-NEXT: movl %edx, 8(%edi) +; CHECK-X86-NEXT: movl %ecx, 4(%edi) +; CHECK-X86-NEXT: movl %eax, (%edi) +; CHECK-X86-NEXT: addl $4, %esp +; CHECK-X86-NEXT: popl %esi +; CHECK-X86-NEXT: popl %edi +; CHECK-X86-NEXT: retl +; +; CHECK-MSVC32-LABEL: store: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %edi +; CHECK-MSVC32-NEXT: pushl %esi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 24(%ebp), %edi +; CHECK-MSVC32-NEXT: movl %esi, 12(%edi) +; CHECK-MSVC32-NEXT: movl %edx, 8(%edi) +; CHECK-MSVC32-NEXT: movl %ecx, 4(%edi) +; CHECK-MSVC32-NEXT: movl %eax, (%edi) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi +; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp +; CHECK-MSVC32-NEXT: retl + store PrimTy %x, ptr %p + ret void +} + +; Illustrate stack alignment +define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind { +; CHECK-X64-F128-LABEL: store_perturbed: +; CHECK-X64-F128: # %bb.0: +; CHECK-X64-F128-NEXT: movaps %xmm0, (%rsi) +; CHECK-X64-F128-NEXT: retq +; +; CHECK-X64-I128-LABEL: store_perturbed: +; CHECK-X64-I128: # %bb.0: +; CHECK-X64-I128-NEXT: movq %rdx, 8(%rcx) +; CHECK-X64-I128-NEXT: movq %rsi, (%rcx) +; CHECK-X64-I128-NEXT: retq +; +; CHECK-MSVC64-F128-LABEL: store_perturbed: +; CHECK-MSVC64-F128: # %bb.0: +; CHECK-MSVC64-F128-NEXT: movaps (%rdx), %xmm0 +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, (%r8) +; CHECK-MSVC64-F128-NEXT: retq +; +; CHECK-MSVC64-I128-LABEL: store_perturbed: +; CHECK-MSVC64-I128: # %bb.0: +; CHECK-MSVC64-I128-NEXT: movq %r8, 8(%r9) +; CHECK-MSVC64-I128-NEXT: movq %rdx, (%r9) +; CHECK-MSVC64-I128-NEXT: retq +; +; CHECK-MINGW-F128-LABEL: store_perturbed: +; CHECK-MINGW-F128: # %bb.0: +; CHECK-MINGW-F128-NEXT: movaps (%rdx), %xmm0 +; CHECK-MINGW-F128-NEXT: movaps %xmm0, (%r8) +; CHECK-MINGW-F128-NEXT: retq +; +; CHECK-MINGW-I128-LABEL: store_perturbed: +; CHECK-MINGW-I128: # %bb.0: +; CHECK-MINGW-I128-NEXT: movq %r8, 8(%r9) +; CHECK-MINGW-I128-NEXT: movq %rdx, (%r9) +; CHECK-MINGW-I128-NEXT: retq +; +; CHECK-X86-LABEL: store_perturbed: +; CHECK-X86: # %bb.0: +; CHECK-X86-NEXT: pushl %edi +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 32(%esp), %eax +; CHECK-X86-NEXT: movl 36(%esp), %ecx +; CHECK-X86-NEXT: movl 40(%esp), %edx +; CHECK-X86-NEXT: movl 44(%esp), %esi +; CHECK-X86-NEXT: movl 48(%esp), 
%edi +; CHECK-X86-NEXT: movl %esi, 12(%edi) +; CHECK-X86-NEXT: movl %edx, 8(%edi) +; CHECK-X86-NEXT: movl %ecx, 4(%edi) +; CHECK-X86-NEXT: movl %eax, (%edi) +; CHECK-X86-NEXT: addl $4, %esp +; CHECK-X86-NEXT: popl %esi +; CHECK-X86-NEXT: popl %edi +; CHECK-X86-NEXT: retl +; +; CHECK-MSVC32-LABEL: store_perturbed: +; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %edi +; CHECK-MSVC32-NEXT: pushl %esi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 24(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 28(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 32(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 36(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 40(%ebp), %edi +; CHECK-MSVC32-NEXT: movl %esi, 12(%edi) +; CHECK-MSVC32-NEXT: movl %edx, 8(%edi) +; CHECK-MSVC32-NEXT: movl %ecx, 4(%edi) +; CHECK-MSVC32-NEXT: movl %eax, (%edi) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi +; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp +; CHECK-MSVC32-NEXT: retl + store PrimTy %x, ptr %p + ret void +} + define PrimTy @return(ptr %p) nounwind { ; CHECK-X64-F128-LABEL: return: ; CHECK-X64-F128: # %bb.0: @@ -53,8 +214,8 @@ define PrimTy @return(ptr %p) nounwind { ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-X86-NEXT: movl 12(%esp), %eax +; CHECK-X86-NEXT: movl 16(%esp), %ecx ; CHECK-X86-NEXT: movl (%ecx), %edx ; CHECK-X86-NEXT: movl 4(%ecx), %esi ; CHECK-X86-NEXT: movl 8(%ecx), %edi @@ -71,8 +232,8 @@ define PrimTy @return(ptr %p) nounwind { ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-MSVC32-NEXT: movl 12(%esp), %eax +; CHECK-MSVC32-NEXT: movl 16(%esp), %ecx ; CHECK-MSVC32-NEXT: movl (%ecx), %edx ; CHECK-MSVC32-NEXT: movl 4(%ecx), %esi ; CHECK-MSVC32-NEXT: movl 8(%ecx), %edi @@ -123,38 +284,47 @@ define PrimTy @first_arg(PrimTy %x) nounwind { ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 32(%esp), %ecx +; CHECK-X86-NEXT: movl 36(%esp), %edx +; CHECK-X86-NEXT: movl 40(%esp), %esi +; CHECK-X86-NEXT: movl 44(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: first_arg: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 24(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 28(%ebp), %edx +; 
CHECK-MSVC32-NEXT: movl 32(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 36(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } +; Leading args such that i128 is the last possible position where it still +; gets passed in registers. define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounwind { ; CHECK-X64-F128-LABEL: leading_args: ; CHECK-X64-F128: # %bb.0: @@ -168,64 +338,72 @@ define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounw ; ; CHECK-MSVC64-F128-LABEL: leading_args: ; CHECK-MSVC64-F128: # %bb.0: -; CHECK-MSVC64-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq 40(%rsp), %rax ; CHECK-MSVC64-F128-NEXT: movaps (%rax), %xmm0 ; CHECK-MSVC64-F128-NEXT: retq ; ; CHECK-MSVC64-I128-LABEL: leading_args: ; CHECK-MSVC64-I128: # %bb.0: -; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MSVC64-I128-NEXT: movq 40(%rsp), %rax +; CHECK-MSVC64-I128-NEXT: movq 48(%rsp), %rdx ; CHECK-MSVC64-I128-NEXT: retq ; ; CHECK-MINGW-F128-LABEL: leading_args: ; CHECK-MINGW-F128: # %bb.0: -; CHECK-MINGW-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq 40(%rsp), %rax ; CHECK-MINGW-F128-NEXT: movaps (%rax), %xmm0 ; CHECK-MINGW-F128-NEXT: retq ; ; CHECK-MINGW-I128-LABEL: leading_args: ; CHECK-MINGW-I128: # %bb.0: -; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MINGW-I128-NEXT: movq 40(%rsp), %rax +; CHECK-MINGW-I128-NEXT: movq 48(%rsp), %rdx ; CHECK-MINGW-I128-NEXT: retq ; ; CHECK-X86-LABEL: leading_args: ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 64(%esp), %ecx +; CHECK-X86-NEXT: movl 68(%esp), %edx +; CHECK-X86-NEXT: movl 72(%esp), %esi +; CHECK-X86-NEXT: movl 76(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: leading_args: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 56(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 60(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 64(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 68(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; 
CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } +; The i128 of interest must be in memory. define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy %_5, PrimTy %x) nounwind { ; CHECK-X64-F128-LABEL: many_leading_args: ; CHECK-X64-F128: # %bb.0: @@ -234,66 +412,73 @@ define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, Pr ; ; CHECK-X64-I128-LABEL: many_leading_args: ; CHECK-X64-I128: # %bb.0: -; CHECK-X64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-X64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-X64-I128-NEXT: movq 24(%rsp), %rax +; CHECK-X64-I128-NEXT: movq 32(%rsp), %rdx ; CHECK-X64-I128-NEXT: retq ; ; CHECK-MSVC64-F128-LABEL: many_leading_args: ; CHECK-MSVC64-F128: # %bb.0: -; CHECK-MSVC64-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq 56(%rsp), %rax ; CHECK-MSVC64-F128-NEXT: movaps (%rax), %xmm0 ; CHECK-MSVC64-F128-NEXT: retq ; ; CHECK-MSVC64-I128-LABEL: many_leading_args: ; CHECK-MSVC64-I128: # %bb.0: -; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MSVC64-I128-NEXT: movq 64(%rsp), %rax +; CHECK-MSVC64-I128-NEXT: movq 72(%rsp), %rdx ; CHECK-MSVC64-I128-NEXT: retq ; ; CHECK-MINGW-F128-LABEL: many_leading_args: ; CHECK-MINGW-F128: # %bb.0: -; CHECK-MINGW-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq 56(%rsp), %rax ; CHECK-MINGW-F128-NEXT: movaps (%rax), %xmm0 ; CHECK-MINGW-F128-NEXT: retq ; ; CHECK-MINGW-I128-LABEL: many_leading_args: ; CHECK-MINGW-I128: # %bb.0: -; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MINGW-I128-NEXT: movq 64(%rsp), %rax +; CHECK-MINGW-I128-NEXT: movq 72(%rsp), %rdx ; CHECK-MINGW-I128-NEXT: retq ; ; CHECK-X86-LABEL: many_leading_args: ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 80(%esp), %ecx +; CHECK-X86-NEXT: movl 84(%esp), %edx +; CHECK-X86-NEXT: movl 88(%esp), %esi +; CHECK-X86-NEXT: movl 92(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: many_leading_args: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 72(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 76(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 80(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 84(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; 
CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -305,66 +490,73 @@ define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy ; ; CHECK-X64-I128-LABEL: trailing_arg: ; CHECK-X64-I128: # %bb.0: -; CHECK-X64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-X64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-X64-I128-NEXT: movq 8(%rsp), %rax +; CHECK-X64-I128-NEXT: movq 16(%rsp), %rdx ; CHECK-X64-I128-NEXT: retq ; ; CHECK-MSVC64-F128-LABEL: trailing_arg: ; CHECK-MSVC64-F128: # %bb.0: -; CHECK-MSVC64-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq 48(%rsp), %rax ; CHECK-MSVC64-F128-NEXT: movaps (%rax), %xmm0 ; CHECK-MSVC64-F128-NEXT: retq ; ; CHECK-MSVC64-I128-LABEL: trailing_arg: ; CHECK-MSVC64-I128: # %bb.0: -; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MSVC64-I128-NEXT: movq 48(%rsp), %rax +; CHECK-MSVC64-I128-NEXT: movq 56(%rsp), %rdx ; CHECK-MSVC64-I128-NEXT: retq ; ; CHECK-MINGW-F128-LABEL: trailing_arg: ; CHECK-MINGW-F128: # %bb.0: -; CHECK-MINGW-F128-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq 48(%rsp), %rax ; CHECK-MINGW-F128-NEXT: movaps (%rax), %xmm0 ; CHECK-MINGW-F128-NEXT: retq ; ; CHECK-MINGW-I128-LABEL: trailing_arg: ; CHECK-MINGW-I128: # %bb.0: -; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-I128-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-MINGW-I128-NEXT: movq 48(%rsp), %rax +; CHECK-MINGW-I128-NEXT: movq 56(%rsp), %rdx ; CHECK-MINGW-I128-NEXT: retq ; ; CHECK-X86-LABEL: trailing_arg: ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 64(%esp), %ecx +; CHECK-X86-NEXT: movl 68(%esp), %edx +; CHECK-X86-NEXT: movl 72(%esp), %esi +; CHECK-X86-NEXT: movl 76(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: trailing_arg: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-MSVC32-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 56(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 60(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 64(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 68(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl 
%esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -388,8 +580,8 @@ define void @call_first_arg(PrimTy %x) nounwind { ; CHECK-MSVC64-F128: # %bb.0: ; CHECK-MSVC64-F128-NEXT: subq $56, %rsp ; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, 32(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq 32(%rsp), %rcx ; CHECK-MSVC64-F128-NEXT: callq first_arg ; CHECK-MSVC64-F128-NEXT: addq $56, %rsp ; CHECK-MSVC64-F128-NEXT: retq @@ -405,8 +597,8 @@ define void @call_first_arg(PrimTy %x) nounwind { ; CHECK-MINGW-F128: # %bb.0: ; CHECK-MINGW-F128-NEXT: subq $56, %rsp ; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; CHECK-MINGW-F128-NEXT: movaps %xmm0, 32(%rsp) +; CHECK-MINGW-F128-NEXT: leaq 32(%rsp), %rcx ; CHECK-MINGW-F128-NEXT: callq first_arg ; CHECK-MINGW-F128-NEXT: addq $56, %rsp ; CHECK-MINGW-F128-NEXT: retq @@ -420,32 +612,43 @@ define void @call_first_arg(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_first_arg: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $56, %esp +; CHECK-X86-NEXT: movl 64(%esp), %eax +; CHECK-X86-NEXT: movl 68(%esp), %ecx +; CHECK-X86-NEXT: movl 72(%esp), %edx +; CHECK-X86-NEXT: movl 76(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 28(%esp) +; CHECK-X86-NEXT: movl %edx, 24(%esp) +; CHECK-X86-NEXT: movl %ecx, 20(%esp) +; CHECK-X86-NEXT: movl %eax, 16(%esp) +; CHECK-X86-NEXT: leal 32(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) ; CHECK-X86-NEXT: calll first_arg@PLT -; CHECK-X86-NEXT: addl $56, %esp +; CHECK-X86-NEXT: addl $52, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_first_arg: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $64, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 28(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 24(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 20(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 16(%esp) +; CHECK-MSVC32-NEXT: leal 32(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) ; CHECK-MSVC32-NEXT: calll _first_arg -; CHECK-MSVC32-NEXT: addl $20, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @first_arg(PrimTy %x) @@ -481,9 +684,9 @@ define void @call_leading_args(PrimTy %x) nounwind { ; CHECK-MSVC64-F128: # %bb.0: ; CHECK-MSVC64-F128-NEXT: subq $72, %rsp ; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-F128-NEXT: 
movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, 48(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq 48(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq %rax, 32(%rsp) ; CHECK-MSVC64-F128-NEXT: xorl %ecx, %ecx ; CHECK-MSVC64-F128-NEXT: xorl %edx, %edx ; CHECK-MSVC64-F128-NEXT: xorl %r8d, %r8d @@ -495,8 +698,8 @@ define void @call_leading_args(PrimTy %x) nounwind { ; CHECK-MSVC64-I128-LABEL: call_leading_args: ; CHECK-MSVC64-I128: # %bb.0: ; CHECK-MSVC64-I128-NEXT: subq $56, %rsp -; CHECK-MSVC64-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rdx, 40(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rcx, 32(%rsp) ; CHECK-MSVC64-I128-NEXT: xorl %ecx, %ecx ; CHECK-MSVC64-I128-NEXT: xorl %edx, %edx ; CHECK-MSVC64-I128-NEXT: xorl %r8d, %r8d @@ -509,9 +712,9 @@ define void @call_leading_args(PrimTy %x) nounwind { ; CHECK-MINGW-F128: # %bb.0: ; CHECK-MINGW-F128-NEXT: subq $72, %rsp ; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: movaps %xmm0, 48(%rsp) +; CHECK-MINGW-F128-NEXT: leaq 48(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq %rax, 32(%rsp) ; CHECK-MINGW-F128-NEXT: xorl %ecx, %ecx ; CHECK-MINGW-F128-NEXT: xorl %edx, %edx ; CHECK-MINGW-F128-NEXT: xorl %r8d, %r8d @@ -523,8 +726,8 @@ define void @call_leading_args(PrimTy %x) nounwind { ; CHECK-MINGW-I128-LABEL: call_leading_args: ; CHECK-MINGW-I128: # %bb.0: ; CHECK-MINGW-I128-NEXT: subq $56, %rsp -; CHECK-MINGW-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-MINGW-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rdx, 40(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rcx, 32(%rsp) ; CHECK-MINGW-I128-NEXT: xorl %ecx, %ecx ; CHECK-MINGW-I128-NEXT: xorl %edx, %edx ; CHECK-MINGW-I128-NEXT: xorl %r8d, %r8d @@ -535,48 +738,59 @@ define void @call_leading_args(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_leading_args: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $88, %esp +; CHECK-X86-NEXT: movl 96(%esp), %eax +; CHECK-X86-NEXT: movl 100(%esp), %ecx +; CHECK-X86-NEXT: movl 104(%esp), %edx +; CHECK-X86-NEXT: movl 108(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 60(%esp) +; CHECK-X86-NEXT: movl %edx, 56(%esp) +; CHECK-X86-NEXT: movl %ecx, 52(%esp) +; CHECK-X86-NEXT: movl %eax, 48(%esp) +; CHECK-X86-NEXT: leal 64(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll leading_args@PLT -; CHECK-X86-NEXT: addl $88, %esp +; CHECK-X86-NEXT: addl $84, 
%esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_leading_args: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $96, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 60(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 56(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 52(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 48(%esp) +; CHECK-MSVC32-NEXT: leal 64(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl $0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _leading_args -; CHECK-MSVC32-NEXT: addl $52, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy %x) @@ -620,12 +834,12 @@ define void @call_many_leading_args(PrimTy %x) nounwind { ; CHECK-MSVC64-F128-NEXT: subq $88, %rsp ; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 ; CHECK-MSVC64-F128-NEXT: xorps %xmm1, %xmm1 -; CHECK-MSVC64-F128-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: movaps %xmm1, 64(%rsp) +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, 48(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq 48(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq %rax, 40(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq 64(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq %rax, 32(%rsp) ; CHECK-MSVC64-F128-NEXT: xorl %ecx, %ecx ; CHECK-MSVC64-F128-NEXT: xorl %edx, %edx ; CHECK-MSVC64-F128-NEXT: xorl %r8d, %r8d @@ -637,10 +851,10 @@ define void @call_many_leading_args(PrimTy %x) nounwind { ; CHECK-MSVC64-I128-LABEL: call_many_leading_args: ; CHECK-MSVC64-I128: # %bb.0: ; CHECK-MSVC64-I128-NEXT: subq $72, %rsp -; CHECK-MSVC64-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-I128-NEXT: movq $0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-I128-NEXT: movq $0, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rdx, 56(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rcx, 48(%rsp) +; CHECK-MSVC64-I128-NEXT: movq $0, 40(%rsp) +; CHECK-MSVC64-I128-NEXT: movq $0, 32(%rsp) ; CHECK-MSVC64-I128-NEXT: xorl %ecx, %ecx ; CHECK-MSVC64-I128-NEXT: xorl %edx, %edx ; CHECK-MSVC64-I128-NEXT: xorl %r8d, %r8d @@ -654,12 
+868,12 @@ define void @call_many_leading_args(PrimTy %x) nounwind { ; CHECK-MINGW-F128-NEXT: subq $88, %rsp ; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 ; CHECK-MINGW-F128-NEXT: xorps %xmm1, %xmm1 -; CHECK-MINGW-F128-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; CHECK-MINGW-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MINGW-F128-NEXT: movaps %xmm1, 64(%rsp) +; CHECK-MINGW-F128-NEXT: movaps %xmm0, 48(%rsp) +; CHECK-MINGW-F128-NEXT: leaq 48(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq %rax, 40(%rsp) +; CHECK-MINGW-F128-NEXT: leaq 64(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq %rax, 32(%rsp) ; CHECK-MINGW-F128-NEXT: xorl %ecx, %ecx ; CHECK-MINGW-F128-NEXT: xorl %edx, %edx ; CHECK-MINGW-F128-NEXT: xorl %r8d, %r8d @@ -671,10 +885,10 @@ define void @call_many_leading_args(PrimTy %x) nounwind { ; CHECK-MINGW-I128-LABEL: call_many_leading_args: ; CHECK-MINGW-I128: # %bb.0: ; CHECK-MINGW-I128-NEXT: subq $72, %rsp -; CHECK-MINGW-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-MINGW-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-MINGW-I128-NEXT: movq $0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-I128-NEXT: movq $0, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rdx, 56(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rcx, 48(%rsp) +; CHECK-MINGW-I128-NEXT: movq $0, 40(%rsp) +; CHECK-MINGW-I128-NEXT: movq $0, 32(%rsp) ; CHECK-MINGW-I128-NEXT: xorl %ecx, %ecx ; CHECK-MINGW-I128-NEXT: xorl %edx, %edx ; CHECK-MINGW-I128-NEXT: xorl %r8d, %r8d @@ -685,56 +899,67 @@ define void @call_many_leading_args(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_many_leading_args: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $104, %esp +; CHECK-X86-NEXT: movl 112(%esp), %eax +; CHECK-X86-NEXT: movl 116(%esp), %ecx +; CHECK-X86-NEXT: movl 120(%esp), %edx +; CHECK-X86-NEXT: movl 124(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 76(%esp) +; CHECK-X86-NEXT: movl %edx, 72(%esp) +; CHECK-X86-NEXT: movl %ecx, 68(%esp) +; CHECK-X86-NEXT: movl %eax, 64(%esp) +; CHECK-X86-NEXT: leal 80(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) +; CHECK-X86-NEXT: movl $0, 60(%esp) +; CHECK-X86-NEXT: movl $0, 56(%esp) +; CHECK-X86-NEXT: movl $0, 52(%esp) +; CHECK-X86-NEXT: movl $0, 48(%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll many_leading_args@PLT -; CHECK-X86-NEXT: addl $104, %esp +; CHECK-X86-NEXT: addl $100, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_many_leading_args: ; CHECK-MSVC32: # %bb.0: ; 
CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $112, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 76(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 72(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 68(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 64(%esp) +; CHECK-MSVC32-NEXT: leal 80(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; CHECK-MSVC32-NEXT: movl $0, 60(%esp) +; CHECK-MSVC32-NEXT: movl $0, 56(%esp) +; CHECK-MSVC32-NEXT: movl $0, 52(%esp) +; CHECK-MSVC32-NEXT: movl $0, 48(%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl $0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _many_leading_args -; CHECK-MSVC32-NEXT: addl $68, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @many_leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy Prim0, PrimTy %x) @@ -770,9 +995,9 @@ define void @call_trailing_arg(PrimTy %x) nounwind { ; CHECK-MSVC64-F128: # %bb.0: ; CHECK-MSVC64-F128-NEXT: subq $72, %rsp ; CHECK-MSVC64-F128-NEXT: movaps (%rcx), %xmm0 -; CHECK-MSVC64-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MSVC64-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-F128-NEXT: movaps %xmm0, 48(%rsp) +; CHECK-MSVC64-F128-NEXT: leaq 48(%rsp), %rax +; CHECK-MSVC64-F128-NEXT: movq %rax, 32(%rsp) ; CHECK-MSVC64-F128-NEXT: xorl %ecx, %ecx ; CHECK-MSVC64-F128-NEXT: xorl %edx, %edx ; CHECK-MSVC64-F128-NEXT: xorl %r8d, %r8d @@ -784,8 +1009,8 @@ define void @call_trailing_arg(PrimTy %x) nounwind { ; CHECK-MSVC64-I128-LABEL: call_trailing_arg: ; CHECK-MSVC64-I128: # %bb.0: ; CHECK-MSVC64-I128-NEXT: subq $56, %rsp -; CHECK-MSVC64-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-MSVC64-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rdx, 40(%rsp) +; CHECK-MSVC64-I128-NEXT: movq %rcx, 32(%rsp) ; CHECK-MSVC64-I128-NEXT: xorl %ecx, %ecx ; CHECK-MSVC64-I128-NEXT: xorl %edx, %edx ; CHECK-MSVC64-I128-NEXT: xorl %r8d, %r8d @@ -798,9 +1023,9 @@ define void @call_trailing_arg(PrimTy %x) nounwind { ; CHECK-MINGW-F128: # %bb.0: ; CHECK-MINGW-F128-NEXT: subq $72, %rsp ; CHECK-MINGW-F128-NEXT: movaps (%rcx), %xmm0 -; CHECK-MINGW-F128-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-MINGW-F128-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; CHECK-MINGW-F128-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; 
CHECK-MINGW-F128-NEXT: movaps %xmm0, 48(%rsp) +; CHECK-MINGW-F128-NEXT: leaq 48(%rsp), %rax +; CHECK-MINGW-F128-NEXT: movq %rax, 32(%rsp) ; CHECK-MINGW-F128-NEXT: xorl %ecx, %ecx ; CHECK-MINGW-F128-NEXT: xorl %edx, %edx ; CHECK-MINGW-F128-NEXT: xorl %r8d, %r8d @@ -812,8 +1037,8 @@ define void @call_trailing_arg(PrimTy %x) nounwind { ; CHECK-MINGW-I128-LABEL: call_trailing_arg: ; CHECK-MINGW-I128: # %bb.0: ; CHECK-MINGW-I128-NEXT: subq $56, %rsp -; CHECK-MINGW-I128-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-MINGW-I128-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rdx, 40(%rsp) +; CHECK-MINGW-I128-NEXT: movq %rcx, 32(%rsp) ; CHECK-MINGW-I128-NEXT: xorl %ecx, %ecx ; CHECK-MINGW-I128-NEXT: xorl %edx, %edx ; CHECK-MINGW-I128-NEXT: xorl %r8d, %r8d @@ -824,48 +1049,59 @@ define void @call_trailing_arg(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_trailing_arg: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $88, %esp +; CHECK-X86-NEXT: movl 96(%esp), %eax +; CHECK-X86-NEXT: movl 100(%esp), %ecx +; CHECK-X86-NEXT: movl 104(%esp), %edx +; CHECK-X86-NEXT: movl 108(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 60(%esp) +; CHECK-X86-NEXT: movl %edx, 56(%esp) +; CHECK-X86-NEXT: movl %ecx, 52(%esp) +; CHECK-X86-NEXT: movl %eax, 48(%esp) +; CHECK-X86-NEXT: leal 64(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll trailing_arg@PLT -; CHECK-X86-NEXT: addl $88, %esp +; CHECK-X86-NEXT: addl $84, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_trailing_arg: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $96, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 60(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 56(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 52(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 48(%esp) +; CHECK-MSVC32-NEXT: leal 64(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl 
$0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _trailing_arg -; CHECK-MSVC32-NEXT: addl $52, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @trailing_arg(i64 0, i64 0, i64 0, i64 0, PrimTy %x) diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll index 717f52f198ee8..7d5757392c982 100644 --- a/llvm/test/CodeGen/X86/i128-sdiv.ll +++ b/llvm/test/CodeGen/X86/i128-sdiv.ll @@ -8,18 +8,21 @@ define i128 @test1(i128 %x) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl $30, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl 28(%ebp), %esi +; X86-NEXT: adcl 32(%ebp), %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: shrdl $2, %ecx, %edx ; X86-NEXT: movl %ecx, %esi @@ -29,8 +32,10 @@ define i128 @test1(i128 %x) nounwind { ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test1: @@ -52,38 +57,44 @@ define i128 @test1(i128 %x) nounwind { define i128 @test2(i128 %x) nounwind { ; X86-LABEL: test2: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shrl $30, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: shrdl $2, %edx, %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl 28(%ebp), %edx +; X86-NEXT: adcl 32(%ebp), %ecx +; X86-NEXT: adcl $0, %eax +; X86-NEXT: shrdl $2, %eax, %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: sarl $2, %edx -; X86-NEXT: xorl %edi, %edi +; X86-NEXT: sarl $2, %eax +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: 
movl %ebx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test2: diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll index 3f890b7f2443a..9011832421326 100644 --- a/llvm/test/CodeGen/X86/i128-udiv.ll +++ b/llvm/test/CodeGen/X86/i128-udiv.ll @@ -8,15 +8,21 @@ define i128 @test1(i128 %x) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edx ; X86-NEXT: shrdl $2, %edx, %ecx ; X86-NEXT: shrl $2, %edx ; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll index 55c318e87a5a0..bdceeefbcfaba 100644 --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -123,31 +123,34 @@ define i64 @test_i64(i64 %a) nounwind { define i128 @test_i128(i128 %a) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: subl %edx, %ebx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test_i128: diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll index c52b3ed6c926d..4a6c1d0ae5deb 100644 --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -10,33 +10,39 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind { ; X86-LABEL: opt_setcc_lt_power_of_2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 24(%ebp), %esi ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB0_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: addl $1, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl $1, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: orl %edx, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: shrdl $28, %ebx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: shrdl $28, %ebx, %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -73,15 +79,21 @@ exit: define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: orl 20(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: shldl $15, %edx, %ecx ; X86-NEXT: sete %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_eq_zero: @@ -98,15 +110,21 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: orl 20(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: shldl $15, %edx, %ecx ; X86-NEXT: setne %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_ne_zero: @@ -123,13 +141,19 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 20(%ebp), %ecx ; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl 8(%ebp), %eax +; X86-NEXT: orl 12(%ebp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: movl %ebp, %esp +; 
X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero: @@ -146,13 +170,19 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 20(%ebp), %ecx ; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl 8(%ebp), %eax +; X86-NEXT: orl 12(%ebp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: setne %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_ne_zero: @@ -170,13 +200,17 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 20(%ebp), %esi ; X86-NEXT: shldl $17, %edx, %esi ; X86-NEXT: shldl $17, %ecx, %edx ; X86-NEXT: shldl $17, %eax, %ecx @@ -194,9 +228,11 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { ; X86-NEXT: calll use@PLT ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users: diff --git a/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll b/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll index 1f3a458ecf3a9..17065a4a61c2c 100644 --- a/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll +++ b/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll @@ -1,10 +1,10 @@ ; RUN: not llc -mtriple=x86_64-unknown-linux-gnu < %s 2>&1 | FileCheck %s -; CHECK: LLVM ERROR: cannot lower calls with arbitrary operand bundles! +; CHECK: LLVM ERROR: cannot lower calls with arbitrary operand bundles: foo, bar, baz declare void @g() define void @f(i32 %arg) { - call void @g() [ "foo"(i32 %arg) ] + call void @g() [ "foo"(i32 %arg), "bar"(i32 %arg), "baz"(i32 %arg) ] ret void } diff --git a/llvm/test/CodeGen/X86/invalid-operand-bundle-callbr.ll b/llvm/test/CodeGen/X86/invalid-operand-bundle-callbr.ll index 56157d205b1c1..79bddc0755415 100644 --- a/llvm/test/CodeGen/X86/invalid-operand-bundle-callbr.ll +++ b/llvm/test/CodeGen/X86/invalid-operand-bundle-callbr.ll @@ -1,6 +1,6 @@ ; RUN: not llc -mtriple=x86_64-unknown-linux-gnu < %s 2>&1 | FileCheck %s -; CHECK: LLVM ERROR: cannot lower callbrs with arbitrary operand bundles! 
+; CHECK: LLVM ERROR: cannot lower callbrs with arbitrary operand bundles: foo define void @f(i32 %arg) { callbr void asm "", ""() [ "foo"(i32 %arg) ] ret void } diff --git a/llvm/test/CodeGen/X86/invalid-operand-bundle-invoke.ll b/llvm/test/CodeGen/X86/invalid-operand-bundle-invoke.ll index 8091a220a44c5..1da41aeab68b9 100644 --- a/llvm/test/CodeGen/X86/invalid-operand-bundle-invoke.ll +++ b/llvm/test/CodeGen/X86/invalid-operand-bundle-invoke.ll @@ -1,6 +1,6 @@ ; RUN: not llc -mtriple=x86_64-unknown-linux-gnu < %s 2>&1 | FileCheck %s -; CHECK: LLVM ERROR: cannot lower invokes with arbitrary operand bundles! +; CHECK: LLVM ERROR: cannot lower invokes with arbitrary operand bundles: foo declare void @g() declare i32 @__gxx_personality_v0(...) diff --git a/llvm/test/CodeGen/X86/kcfi.ll b/llvm/test/CodeGen/X86/kcfi.ll index 059efcc71b0eb..fd93b8e3d4188 100644 --- a/llvm/test/CodeGen/X86/kcfi.ll +++ b/llvm/test/CodeGen/X86/kcfi.ll @@ -138,6 +138,29 @@ define void @f8() { ret void } +declare i32 @__gxx_personality_v0(...) + +define void @f9() personality ptr @__gxx_personality_v0 { +; MIR-LABEL: name: f9 +; MIR: body: +; ISEL: CALL64m killed %0, 1, $noreg, 0, $noreg, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, cfi-type 12345678 +; KCFI: $r11 = MOV64rm killed renamable $rax, 1, $noreg, 0, $noreg +; KCFI-NEXT: BUNDLE{{.*}} { +; KCFI-NEXT: KCFI_CHECK $r11, 12345678, implicit-def $r10, implicit-def $r11, implicit-def $eflags +; KCFI-NEXT: CALL64r internal $r11, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp +; KCFI-NEXT: } + %1 = load ptr, ptr @g, align 8 + invoke void %1() [ "kcfi"(i32 12345678) ] + to label %cont + unwind label %err +cont: + ret void +err: + %exn = landingpad { ptr, i32 } + catch ptr null + resume { ptr, i32 } %exn +} + attributes #0 = { "target-features"="+retpoline-indirect-branches,+retpoline-indirect-calls" } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/X86/lea-2.ll b/llvm/test/CodeGen/X86/lea-2.ll index a48c02ff3e0b7..0883a8e726e25 100644 --- a/llvm/test/CodeGen/X86/lea-2.ll +++ b/llvm/test/CodeGen/X86/lea-2.ll @@ -2,7 +2,6 @@ ; RUN: llc < %s -mtriple=i686-linux | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s --check-prefix=X64 ; The computation of %t4 should match a single lea, without using actual add instructions.
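For reference, a minimal sketch of the base + scaled-index + displacement pattern that comment describes; this function and its expected assembly are illustrative only (not part of the patch or the test file), assuming the usual x86-64 SysV lowering where instruction selection folds the shift and both adds into one lea:

define i32 @lea_sketch(i32 %a, i32 %b) {
  %t1 = shl i32 %a, 2   ; %a * 4 supplies the scaled index
  %t2 = add i32 %b, %t1 ; %b supplies the base register
  %t4 = add i32 %t2, 4  ; constant displacement
  ret i32 %t4           ; expected: a single leal 4(%rsi,%rdi,4), %eax
}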
diff --git a/llvm/test/CodeGen/X86/lea-3.ll b/llvm/test/CodeGen/X86/lea-3.ll
index b7f1c4ae11549..2cbefc0689c11 100644
--- a/llvm/test/CodeGen/X86/lea-3.ll
+++ b/llvm/test/CodeGen/X86/lea-3.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=WIN32
 
 define i64 @test2(i64 %a) {
diff --git a/llvm/test/CodeGen/X86/lea-4.ll b/llvm/test/CodeGen/X86/lea-4.ll
index e1f0b73c33ffb..c33697e0abf3d 100644
--- a/llvm/test/CodeGen/X86/lea-4.ll
+++ b/llvm/test/CodeGen/X86/lea-4.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
 
 define zeroext i16 @t1(i32 %on_off) nounwind {
 ; CHECK-LABEL: t1:
diff --git a/llvm/test/CodeGen/X86/lea-5.ll b/llvm/test/CodeGen/X86/lea-5.ll
index 908ec3eae7f65..39051eac45d7d 100644
--- a/llvm/test/CodeGen/X86/lea-5.ll
+++ b/llvm/test/CodeGen/X86/lea-5.ll
@@ -4,7 +4,6 @@
 
 ; RUN: llc < %s -mtriple=x86_64-linux -O2 | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O2 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-nacl -O2 | FileCheck %s -check-prefix=X32
 
 ; Function Attrs: nounwind readnone uwtable
 define void @foo(i32 %x, i32 %d) #0 {
diff --git a/llvm/test/CodeGen/X86/lea.ll b/llvm/test/CodeGen/X86/lea.ll
index 33d121f6849ba..28c66b94a69eb 100644
--- a/llvm/test/CodeGen/X86/lea.ll
+++ b/llvm/test/CodeGen/X86/lea.ll
@@ -2,7 +2,6 @@
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=LINUX
 ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefixes=WIN
 ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefixes=LINUX
-; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s --check-prefixes=LINUX
 
 define i32 @test1(i32 %x) nounwind {
 ; LINUX-LABEL: test1:
diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index fc1cc1f65627a..e10e48f9aea08 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -18,85 +18,80 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 28
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    imull %ecx, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %esi, %eax
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    imull %edi, %eax
 ; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull 28(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    imull %edi, %esi
 ; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 4(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %eax, 8(%ecx)
 ; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl $4
   %k = mul i128 %t, %u
   ret i128 %k
diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll
index 961205c50d976..724b2dc4c431a 100644
--- a/llvm/test/CodeGen/X86/neg-abs.ll
+++ b/llvm/test/CodeGen/X86/neg-abs.ll
@@ -105,31 +105,35 @@ define i128 @neg_abs_i128(i128 %x) nounwind {
 ; X86-LABEL: neg_abs_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    subl %ebx, %ebp
 ; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    subl %edi, %ebx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ebp, (%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -259,37 +263,42 @@ define i64 @sub_abs_i64(i64 %x, i64 %y) nounwind {
 define i128 @sub_abs_i128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: sub_abs_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 40(%ebp), %edx
 ; X86-NEXT:    subl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 44(%ebp), %edi
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 48(%ebp), %esi
 ; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: sub_abs_i128:
diff --git a/llvm/test/CodeGen/X86/pcsections-atomics.ll b/llvm/test/CodeGen/X86/pcsections-atomics.ll
index 672ebc1ec7275..69ae1f19f3200 100644
--- a/llvm/test/CodeGen/X86/pcsections-atomics.ll
+++ b/llvm/test/CodeGen/X86/pcsections-atomics.ll
@@ -9,6 +9,7 @@
 ; RUN: llc -O1 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O1
 ; RUN: llc -O2 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O2
 ; RUN: llc -O3 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O3
+; RUN: llc -O3 -mcpu=haswell -mattr=cx16 < %s | FileCheck %s --check-prefixes=HASWELL-O3
 
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -50,6 +51,14
@@ define void @mixed_atomic_non_atomic(ptr %a) { ; O3-NEXT: movl $1, (%rdi) ; O3-NEXT: decl (%rdi) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: mixed_atomic_non_atomic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: incl (%rdi) +; HASWELL-O3-NEXT: .Lpcsection0: +; HASWELL-O3-NEXT: movl $1, (%rdi) +; HASWELL-O3-NEXT: decl (%rdi) +; HASWELL-O3-NEXT: retq entry: ; Accesses the same location atomically and non-atomically. %0 = load volatile i32, ptr %a, align 4 @@ -107,6 +116,17 @@ define i64 @mixed_complex_atomic_non_atomic(ptr %a, ptr %b) { ; O3-NEXT: movq %rdx, (%rsi) ; O3-NEXT: addq %rcx, %rax ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: mixed_complex_atomic_non_atomic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movl $1, %eax +; HASWELL-O3-NEXT: .Lpcsection1: +; HASWELL-O3-NEXT: lock xaddq %rax, (%rdi) +; HASWELL-O3-NEXT: movq (%rsi), %rcx +; HASWELL-O3-NEXT: leaq 1(%rcx), %rdx +; HASWELL-O3-NEXT: movq %rdx, (%rsi) +; HASWELL-O3-NEXT: addq %rcx, %rax +; HASWELL-O3-NEXT: retq entry: %0 = atomicrmw add ptr %a, i64 1 monotonic, align 8, !pcsections !0 %1 = load i64, ptr %b, align 8 @@ -148,6 +168,14 @@ define i8 @atomic8_load_unordered(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection2: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a unordered, align 1, !pcsections !0 @@ -187,6 +215,14 @@ define i8 @atomic8_load_monotonic(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection3: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a monotonic, align 1, !pcsections !0 @@ -226,6 +262,14 @@ define i8 @atomic8_load_acquire(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection4: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a acquire, align 1, !pcsections !0 @@ -265,6 +309,14 @@ define i8 @atomic8_load_seq_cst(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection5: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a seq_cst, align 1, !pcsections !0 @@ -304,6 +356,14 @@ define void @atomic8_store_unordered(ptr %a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection6: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, 
ptr @foo, align 8 store atomic i8 42, ptr %a unordered, align 1, !pcsections !0 @@ -343,6 +403,14 @@ define void @atomic8_store_monotonic(ptr %a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection7: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a monotonic, align 1, !pcsections !0 @@ -382,6 +450,14 @@ define void @atomic8_store_release(ptr %a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection8: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a release, align 1, !pcsections !0 @@ -425,6 +501,15 @@ define void @atomic8_store_seq_cst(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection9: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a seq_cst, align 1, !pcsections !0 @@ -468,6 +553,15 @@ define void @atomic8_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection10: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -507,6 +601,14 @@ define void @atomic8_add_monotonic(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection11: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -546,6 +648,14 @@ define void @atomic8_sub_monotonic(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection12: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -585,6 +695,14 @@ define void @atomic8_and_monotonic(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection13: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; 
HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -624,6 +742,14 @@ define void @atomic8_or_monotonic(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection14: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -663,6 +789,14 @@ define void @atomic8_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection15: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -763,6 +897,27 @@ define void @atomic8_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection16: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB16_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection17: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection18: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection19: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection20: +; HASWELL-O3-NEXT: jne .LBB16_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -806,6 +961,15 @@ define void @atomic8_xchg_acquire(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection21: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -845,6 +1009,14 @@ define void @atomic8_add_acquire(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection22: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -884,6 +1056,14 @@ define void @atomic8_sub_acquire(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; 
HASWELL-O3-NEXT: .Lpcsection23: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -923,6 +1103,14 @@ define void @atomic8_and_acquire(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection24: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -962,6 +1150,14 @@ define void @atomic8_or_acquire(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection25: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1001,6 +1197,14 @@ define void @atomic8_xor_acquire(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection26: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1101,6 +1305,27 @@ define void @atomic8_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection27: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB23_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection28: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection29: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection30: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection31: +; HASWELL-O3-NEXT: jne .LBB23_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1144,6 +1369,15 @@ define void @atomic8_xchg_release(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection32: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1183,6 +1417,14 @@ define void @atomic8_add_release(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_release: +; HASWELL-O3: 
# %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection33: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1222,6 +1464,14 @@ define void @atomic8_sub_release(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection34: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1261,6 +1511,14 @@ define void @atomic8_and_release(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection35: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1300,6 +1558,14 @@ define void @atomic8_or_release(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection36: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1339,6 +1605,14 @@ define void @atomic8_xor_release(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection37: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1439,6 +1713,27 @@ define void @atomic8_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection38: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB30_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection39: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection40: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection41: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection42: +; HASWELL-O3-NEXT: jne .LBB30_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1482,6 +1777,15 @@ define void @atomic8_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: 
atomic8_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection43: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1521,6 +1825,14 @@ define void @atomic8_add_acq_rel(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection44: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1560,6 +1872,14 @@ define void @atomic8_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection45: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1599,6 +1919,14 @@ define void @atomic8_and_acq_rel(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection46: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1638,6 +1966,14 @@ define void @atomic8_or_acq_rel(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection47: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1677,6 +2013,14 @@ define void @atomic8_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection48: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1777,6 +2121,27 @@ define void @atomic8_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection49: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB37_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection50: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: 
.Lpcsection51: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection52: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection53: +; HASWELL-O3-NEXT: jne .LBB37_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1820,6 +2185,15 @@ define void @atomic8_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection54: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1859,6 +2233,14 @@ define void @atomic8_add_seq_cst(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection55: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1898,6 +2280,14 @@ define void @atomic8_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection56: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1937,6 +2327,14 @@ define void @atomic8_and_seq_cst(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection57: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1976,6 +2374,14 @@ define void @atomic8_or_seq_cst(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection58: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2015,6 +2421,14 @@ define void @atomic8_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection59: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2115,6 +2529,27 
@@ define void @atomic8_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection60: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB44_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection61: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection62: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection63: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection64: +; HASWELL-O3-NEXT: jne .LBB44_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2200,6 +2635,25 @@ define void @atomic8_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection65: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection66: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection67: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection68: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection69: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection70: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 monotonic monotonic, align 1, !pcsections !0 @@ -2287,6 +2741,25 @@ define void @atomic8_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection71: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection72: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection73: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection74: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection75: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection76: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 acquire monotonic, align 1, !pcsections !0 @@ -2374,6 +2847,25 @@ define void @atomic8_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection77: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection78: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection79: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection80: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection81: +; HASWELL-O3-NEXT: movb $42, %al +; 
HASWELL-O3-NEXT: .Lpcsection82: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 release monotonic, align 1, !pcsections !0 @@ -2461,6 +2953,25 @@ define void @atomic8_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection83: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection84: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection85: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection86: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection87: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection88: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 acq_rel monotonic, align 1, !pcsections !0 @@ -2548,6 +3059,25 @@ define void @atomic8_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection89: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection90: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection91: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection92: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection93: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection94: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 seq_cst monotonic, align 1, !pcsections !0 @@ -2589,6 +3119,14 @@ define i16 @atomic16_load_unordered(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection95: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a unordered, align 2, !pcsections !0 @@ -2628,6 +3166,14 @@ define i16 @atomic16_load_monotonic(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection96: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a monotonic, align 2, !pcsections !0 @@ -2667,6 +3213,14 @@ define i16 @atomic16_load_acquire(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection97: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; 
HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a acquire, align 2, !pcsections !0 @@ -2706,6 +3260,14 @@ define i16 @atomic16_load_seq_cst(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection98: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a seq_cst, align 2, !pcsections !0 @@ -2745,6 +3307,14 @@ define void @atomic16_store_unordered(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection99: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a unordered, align 2, !pcsections !0 @@ -2784,6 +3354,14 @@ define void @atomic16_store_monotonic(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection100: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a monotonic, align 2, !pcsections !0 @@ -2823,6 +3401,14 @@ define void @atomic16_store_release(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection101: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a release, align 2, !pcsections !0 @@ -2866,6 +3452,15 @@ define void @atomic16_store_seq_cst(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection102: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a seq_cst, align 2, !pcsections !0 @@ -2909,6 +3504,15 @@ define void @atomic16_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection103: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -2948,6 +3552,14 @@ define void @atomic16_add_monotonic(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; 
HASWELL-O3-NEXT: .Lpcsection104: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -2987,6 +3599,14 @@ define void @atomic16_sub_monotonic(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection105: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3026,6 +3646,14 @@ define void @atomic16_and_monotonic(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection106: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3065,6 +3693,14 @@ define void @atomic16_or_monotonic(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection107: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3104,6 +3740,14 @@ define void @atomic16_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection108: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3220,6 +3864,31 @@ define void @atomic16_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection109: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB64_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection110: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection111: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection112: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection113: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection114: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection115: +; HASWELL-O3-NEXT: jne .LBB64_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 
monotonic, align 2, !pcsections !0 @@ -3263,6 +3932,15 @@ define void @atomic16_xchg_acquire(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection116: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3302,6 +3980,14 @@ define void @atomic16_add_acquire(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection117: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3341,6 +4027,14 @@ define void @atomic16_sub_acquire(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection118: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3380,6 +4074,14 @@ define void @atomic16_and_acquire(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection119: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3419,6 +4121,14 @@ define void @atomic16_or_acquire(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection120: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3458,6 +4168,14 @@ define void @atomic16_xor_acquire(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection121: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3574,6 +4292,31 @@ define void @atomic16_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection122: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; 
HASWELL-O3-NEXT: .LBB71_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection123: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection124: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection125: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection126: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection127: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection128: +; HASWELL-O3-NEXT: jne .LBB71_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3617,6 +4360,15 @@ define void @atomic16_xchg_release(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection129: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3656,6 +4408,14 @@ define void @atomic16_add_release(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection130: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3695,6 +4455,14 @@ define void @atomic16_sub_release(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection131: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3734,6 +4502,14 @@ define void @atomic16_and_release(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection132: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3773,6 +4549,14 @@ define void @atomic16_or_release(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection133: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3812,6 +4596,14 @@ define void @atomic16_xor_release(ptr %a) { ; O3-NEXT: lock 
xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection134: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3928,6 +4720,31 @@ define void @atomic16_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection135: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB78_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection136: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection137: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection138: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection139: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection140: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection141: +; HASWELL-O3-NEXT: jne .LBB78_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3971,6 +4788,15 @@ define void @atomic16_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection142: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4010,6 +4836,14 @@ define void @atomic16_add_acq_rel(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection143: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4049,6 +4883,14 @@ define void @atomic16_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection144: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4088,6 +4930,14 @@ define void @atomic16_and_acq_rel(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: 
.Lpcsection145: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4127,6 +4977,14 @@ define void @atomic16_or_acq_rel(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection146: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4166,6 +5024,14 @@ define void @atomic16_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection147: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4282,6 +5148,31 @@ define void @atomic16_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection148: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB85_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection149: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection150: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection151: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection152: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection153: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection154: +; HASWELL-O3-NEXT: jne .LBB85_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4325,6 +5216,15 @@ define void @atomic16_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection155: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4364,6 +5264,14 @@ define void @atomic16_add_seq_cst(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection156: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 seq_cst, align 2, 
!pcsections !0 @@ -4403,6 +5311,14 @@ define void @atomic16_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection157: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4442,6 +5358,14 @@ define void @atomic16_and_seq_cst(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection158: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4481,6 +5405,14 @@ define void @atomic16_or_seq_cst(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection159: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4520,6 +5452,14 @@ define void @atomic16_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection160: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4636,6 +5576,31 @@ define void @atomic16_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection161: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB92_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection162: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection163: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection164: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection165: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection166: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection167: +; HASWELL-O3-NEXT: jne .LBB92_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4712,6 +5677,22 @@ define void @atomic16_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry 
+; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection168: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection169: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection170: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 monotonic monotonic, align 2, !pcsections !0 @@ -4790,6 +5771,22 @@ define void @atomic16_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection171: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection172: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection173: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 acquire monotonic, align 2, !pcsections !0 @@ -4868,6 +5865,22 @@ define void @atomic16_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection174: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection175: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection176: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 release monotonic, align 2, !pcsections !0 @@ -4946,6 +5959,22 @@ define void @atomic16_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection177: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection178: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection179: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 acq_rel monotonic, align 2, !pcsections !0 @@ -5024,6 +6053,22 @@ define void @atomic16_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection180: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; 
HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection181: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection182: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 seq_cst monotonic, align 2, !pcsections !0 @@ -5065,6 +6110,14 @@ define i32 @atomic32_load_unordered(ptr %a) { ; O3-NEXT: movl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection183: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i32, ptr %a unordered, align 4, !pcsections !0 @@ -5104,6 +6157,14 @@ define i32 @atomic32_load_monotonic(ptr %a) { ; O3-NEXT: movl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection184: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i32, ptr %a monotonic, align 4, !pcsections !0 @@ -5143,6 +6204,14 @@ define i32 @atomic32_load_acquire(ptr %a) { ; O3-NEXT: movl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection185: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i32, ptr %a acquire, align 4, !pcsections !0 @@ -5182,6 +6251,14 @@ define i32 @atomic32_load_seq_cst(ptr %a) { ; O3-NEXT: movl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection186: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i32, ptr %a seq_cst, align 4, !pcsections !0 @@ -5221,6 +6298,14 @@ define void @atomic32_store_unordered(ptr %a) { ; O3-NEXT: movl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection187: +; HASWELL-O3-NEXT: movl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i32 42, ptr %a unordered, align 4, !pcsections !0 @@ -5260,6 +6345,14 @@ define void @atomic32_store_monotonic(ptr %a) { ; O3-NEXT: movl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection188: +; HASWELL-O3-NEXT: movl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i32 42, ptr %a monotonic, align 4, !pcsections !0 @@ -5299,6 +6392,14 @@ define void @atomic32_store_release(ptr %a) { ; 
O3-NEXT: movl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection189: +; HASWELL-O3-NEXT: movl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i32 42, ptr %a release, align 4, !pcsections !0 @@ -5342,6 +6443,15 @@ define void @atomic32_store_seq_cst(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection190: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i32 42, ptr %a seq_cst, align 4, !pcsections !0 @@ -5385,6 +6495,15 @@ define void @atomic32_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection191: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5424,6 +6543,14 @@ define void @atomic32_add_monotonic(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection192: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5463,6 +6590,14 @@ define void @atomic32_sub_monotonic(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection193: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5502,6 +6637,14 @@ define void @atomic32_and_monotonic(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection194: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5541,6 +6684,14 @@ define void @atomic32_or_monotonic(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection195: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr 
@foo, align 8 %x = atomicrmw or ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5580,6 +6731,14 @@ define void @atomic32_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection196: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5680,6 +6839,27 @@ define void @atomic32_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection197: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB112_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection198: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection199: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection200: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection201: +; HASWELL-O3-NEXT: jne .LBB112_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5723,6 +6903,15 @@ define void @atomic32_xchg_acquire(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection202: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5762,6 +6951,14 @@ define void @atomic32_add_acquire(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection203: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5801,6 +6998,14 @@ define void @atomic32_sub_acquire(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection204: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5840,6 +7045,14 @@ define void @atomic32_and_acquire(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection205: +; HASWELL-O3-NEXT: 
lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5879,6 +7092,14 @@ define void @atomic32_or_acquire(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection206: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5918,6 +7139,14 @@ define void @atomic32_xor_acquire(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection207: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -6018,6 +7247,27 @@ define void @atomic32_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection208: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB119_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection209: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection210: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection211: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection212: +; HASWELL-O3-NEXT: jne .LBB119_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -6061,6 +7311,15 @@ define void @atomic32_xchg_release(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection213: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6100,6 +7359,14 @@ define void @atomic32_add_release(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection214: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6139,6 +7406,14 @@ define void @atomic32_sub_release(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_release: +; HASWELL-O3: # %bb.0: # 
%entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection215: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6178,6 +7453,14 @@ define void @atomic32_and_release(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection216: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6217,6 +7500,14 @@ define void @atomic32_or_release(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection217: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6256,6 +7547,14 @@ define void @atomic32_xor_release(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection218: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6356,6 +7655,27 @@ define void @atomic32_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection219: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB126_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection220: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection221: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection222: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection223: +; HASWELL-O3-NEXT: jne .LBB126_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6399,6 +7719,15 @@ define void @atomic32_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection224: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6438,6 +7767,14 @@ define void @atomic32_add_acq_rel(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, 
foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection225: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6477,6 +7814,14 @@ define void @atomic32_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection226: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6516,6 +7861,14 @@ define void @atomic32_and_acq_rel(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection227: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6555,6 +7908,14 @@ define void @atomic32_or_acq_rel(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection228: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6594,6 +7955,14 @@ define void @atomic32_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection229: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6694,6 +8063,27 @@ define void @atomic32_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection230: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB133_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection231: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection232: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection233: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection234: +; HASWELL-O3-NEXT: jne .LBB133_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6737,6 +8127,15 @@ define void 
@atomic32_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection235: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6776,6 +8175,14 @@ define void @atomic32_add_seq_cst(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection236: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6815,6 +8222,14 @@ define void @atomic32_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection237: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6854,6 +8269,14 @@ define void @atomic32_and_seq_cst(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection238: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6893,6 +8316,14 @@ define void @atomic32_or_seq_cst(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection239: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6932,6 +8363,14 @@ define void @atomic32_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection240: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -7032,6 +8471,27 @@ define void @atomic32_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection241: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB140_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # 
=>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection242: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection243: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection244: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection245: +; HASWELL-O3-NEXT: jne .LBB140_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -7117,6 +8577,25 @@ define void @atomic32_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection246: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection247: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection248: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection249: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection250: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection251: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 monotonic monotonic, align 4, !pcsections !0 @@ -7204,6 +8683,25 @@ define void @atomic32_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection252: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection253: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection254: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection255: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection256: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection257: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 acquire monotonic, align 4, !pcsections !0 @@ -7291,6 +8789,25 @@ define void @atomic32_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection258: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection259: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection260: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection261: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection262: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection263: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 release monotonic, align 4, !pcsections !0 @@ -7378,6 +8895,25 @@ define void @atomic32_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; 
O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection264: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection265: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection266: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection267: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection268: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection269: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 acq_rel monotonic, align 4, !pcsections !0 @@ -7465,6 +9001,25 @@ define void @atomic32_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection270: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection271: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection272: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection273: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection274: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection275: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 seq_cst monotonic, align 4, !pcsections !0 @@ -7506,6 +9061,14 @@ define i64 @atomic64_load_unordered(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection276: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i64, ptr %a unordered, align 8, !pcsections !0 @@ -7545,6 +9108,14 @@ define i64 @atomic64_load_monotonic(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection277: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i64, ptr %a monotonic, align 8, !pcsections !0 @@ -7584,6 +9155,14 @@ define i64 @atomic64_load_acquire(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection278: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i64, ptr %a acquire, align 8, !pcsections !0 @@ -7623,6 +9202,14 @@ define i64 @atomic64_load_seq_cst(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry 
+; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection279: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i64, ptr %a seq_cst, align 8, !pcsections !0 @@ -7662,6 +9249,14 @@ define ptr @atomic64_load_seq_cst_ptr_ty(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection280: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic ptr, ptr %a seq_cst, align 8, !pcsections !0 @@ -7701,6 +9296,14 @@ define void @atomic64_store_unordered(ptr %a) { ; O3-NEXT: movq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection281: +; HASWELL-O3-NEXT: movq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i64 42, ptr %a unordered, align 8, !pcsections !0 @@ -7740,6 +9343,14 @@ define void @atomic64_store_monotonic(ptr %a) { ; O3-NEXT: movq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection282: +; HASWELL-O3-NEXT: movq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i64 42, ptr %a monotonic, align 8, !pcsections !0 @@ -7779,6 +9390,14 @@ define void @atomic64_store_release(ptr %a) { ; O3-NEXT: movq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection283: +; HASWELL-O3-NEXT: movq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i64 42, ptr %a release, align 8, !pcsections !0 @@ -7822,6 +9441,15 @@ define void @atomic64_store_seq_cst(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection284: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i64 42, ptr %a seq_cst, align 8, !pcsections !0 @@ -7861,6 +9489,14 @@ define void @atomic64_store_seq_cst_ptr_ty(ptr %a, ptr %v) { ; O3-NEXT: xchgq %rsi, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection285: +; HASWELL-O3-NEXT: xchgq %rsi, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic ptr %v, ptr %a seq_cst, align 8, !pcsections !0 @@ -7904,6 +9540,15 @@ define void @atomic64_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; 
O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection286: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -7943,6 +9588,14 @@ define void @atomic64_add_monotonic(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection287: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -7982,6 +9635,14 @@ define void @atomic64_sub_monotonic(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection288: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8021,6 +9682,14 @@ define void @atomic64_and_monotonic(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection289: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8060,6 +9729,14 @@ define void @atomic64_or_monotonic(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection290: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8099,6 +9776,14 @@ define void @atomic64_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection291: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8202,6 +9887,27 @@ define void @atomic64_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection292: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB162_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: 
movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection293: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection294: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection295: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection296: +; HASWELL-O3-NEXT: jne .LBB162_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8245,6 +9951,15 @@ define void @atomic64_xchg_acquire(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection297: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8284,6 +9999,14 @@ define void @atomic64_add_acquire(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection298: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8323,6 +10046,14 @@ define void @atomic64_sub_acquire(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection299: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8362,6 +10093,14 @@ define void @atomic64_and_acquire(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection300: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8401,6 +10140,14 @@ define void @atomic64_or_acquire(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection301: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8440,6 +10187,14 @@ define void @atomic64_xor_acquire(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection302: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; 
HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8543,6 +10298,27 @@ define void @atomic64_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection303: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB169_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection304: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection305: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection306: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection307: +; HASWELL-O3-NEXT: jne .LBB169_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8586,6 +10362,15 @@ define void @atomic64_xchg_release(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection308: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8625,6 +10410,14 @@ define void @atomic64_add_release(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection309: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8664,6 +10457,14 @@ define void @atomic64_sub_release(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection310: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8703,6 +10504,14 @@ define void @atomic64_and_release(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection311: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8742,6 +10551,14 @@ define void @atomic64_or_release(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; 
HASWELL-O3-NEXT: .Lpcsection312: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8781,6 +10598,14 @@ define void @atomic64_xor_release(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection313: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8884,6 +10709,27 @@ define void @atomic64_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection314: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB176_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection315: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection316: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection317: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection318: +; HASWELL-O3-NEXT: jne .LBB176_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8927,6 +10773,15 @@ define void @atomic64_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection319: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -8966,6 +10821,14 @@ define void @atomic64_add_acq_rel(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection320: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9005,6 +10868,14 @@ define void @atomic64_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection321: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9044,6 +10915,14 @@ define void @atomic64_and_acq_rel(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; 
HASWELL-O3-LABEL: atomic64_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection322: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9083,6 +10962,14 @@ define void @atomic64_or_acq_rel(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection323: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9122,6 +11009,14 @@ define void @atomic64_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection324: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9225,6 +11120,27 @@ define void @atomic64_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection325: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB183_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection326: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection327: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection328: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection329: +; HASWELL-O3-NEXT: jne .LBB183_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9268,6 +11184,15 @@ define void @atomic64_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection330: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9307,6 +11232,14 @@ define void @atomic64_add_seq_cst(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection331: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9346,6 +11279,14 @@ define void 
@atomic64_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection332: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9385,6 +11326,14 @@ define void @atomic64_and_seq_cst(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection333: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9424,6 +11373,14 @@ define void @atomic64_or_seq_cst(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection334: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9463,6 +11420,14 @@ define void @atomic64_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection335: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9566,6 +11531,27 @@ define void @atomic64_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection336: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB190_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection337: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection338: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection339: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection340: +; HASWELL-O3-NEXT: jne .LBB190_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9651,6 +11637,25 @@ define void @atomic64_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection341: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection342: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: 
.Lpcsection343: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection344: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection345: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection346: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 monotonic monotonic, align 8, !pcsections !0 @@ -9738,6 +11743,25 @@ define void @atomic64_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection347: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection348: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection349: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection350: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection351: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection352: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 acquire monotonic, align 8, !pcsections !0 @@ -9825,6 +11849,25 @@ define void @atomic64_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection353: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection354: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection355: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection356: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection357: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection358: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 release monotonic, align 8, !pcsections !0 @@ -9912,6 +11955,25 @@ define void @atomic64_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection359: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection360: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection361: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection362: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection363: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection364: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 acq_rel monotonic, align 8, !pcsections !0 @@ -9999,6 +12061,25 @@ define void @atomic64_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $3, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_seq_cst: +; HASWELL-O3: # 
%bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection365: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection366: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection367: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection368: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection369: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection370: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $3, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 seq_cst monotonic, align 8, !pcsections !0 @@ -10044,6 +12125,15 @@ define void @atomic64_cas_seq_cst_ptr_ty(ptr %a, ptr %v1, ptr %v2) { ; O3-NEXT: lock cmpxchgq %rdx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq %rsi, %rax +; HASWELL-O3-NEXT: movq foo(%rip), %rcx +; HASWELL-O3-NEXT: .Lpcsection371: +; HASWELL-O3-NEXT: lock cmpxchgq %rdx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, ptr %v1, ptr %v2 seq_cst seq_cst, align 8, !pcsections !0 @@ -10102,6 +12192,18 @@ define i64 @atomic_use_cond(ptr %a) { ; O3-NEXT: .LBB197_2: # %else ; O3-NEXT: movl $2, %eax ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic_use_cond: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: .Lpcsection372: +; HASWELL-O3-NEXT: lock decq (%rdi) +; HASWELL-O3-NEXT: jne .LBB197_2 +; HASWELL-O3-NEXT: # %bb.1: # %then +; HASWELL-O3-NEXT: movl $1, %eax +; HASWELL-O3-NEXT: retq +; HASWELL-O3-NEXT: .LBB197_2: # %else +; HASWELL-O3-NEXT: movl $2, %eax +; HASWELL-O3-NEXT: retq entry: %x = atomicrmw sub ptr %a, i64 1 seq_cst, align 8, !pcsections !0 %y = icmp eq i64 %x, 1 @@ -10196,6 +12298,18 @@ define i128 @atomic128_load_unordered(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection373: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection374: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection375: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a unordered, align 16, !pcsections !0 @@ -10285,6 +12399,18 @@ define i128 @atomic128_load_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection376: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection377: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection378: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a monotonic, align 16, !pcsections !0 @@ -10374,6 +12500,18 @@ define i128 @atomic128_load_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq 
foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection379: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection380: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection381: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a acquire, align 16, !pcsections !0 @@ -10463,6 +12601,18 @@ define i128 @atomic128_load_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection382: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection383: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection384: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a seq_cst, align 16, !pcsections !0 @@ -10502,6 +12652,14 @@ define ptr @atomic128_load_seq_cst_ptr_ty(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection385: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic ptr, ptr %a seq_cst, align 16, !pcsections !0 @@ -10629,6 +12787,16 @@ define void @atomic128_store_unordered(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection386: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection387: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a unordered, align 16, !pcsections !0 @@ -10756,6 +12924,16 @@ define void @atomic128_store_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection388: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection389: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a monotonic, align 16, !pcsections !0 @@ -10883,6 +13061,16 @@ define void @atomic128_store_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection390: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection391: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a release, align 16, !pcsections !0 @@ -11010,6 +13198,18 @@ define void @atomic128_store_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: 
.cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection392: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection393: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection394: +; HASWELL-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a seq_cst, align 16, !pcsections !0 @@ -11049,6 +13249,14 @@ define void @atomic128_store_seq_cst_ptr_ty(ptr %a, ptr %v) { ; O3-NEXT: xchgq %rsi, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection395: +; HASWELL-O3-NEXT: xchgq %rsi, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic ptr %v, ptr %a seq_cst, align 16, !pcsections !0 @@ -11176,6 +13384,33 @@ define void @atomic128_xchg_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection396: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection397: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection398: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB208_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection399: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection400: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection401: +; HASWELL-O3-NEXT: jne .LBB208_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11309,6 +13544,35 @@ define void @atomic128_add_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection402: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection403: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB209_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection404: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection405: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection406: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection407: +; HASWELL-O3-NEXT: jne .LBB209_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: 
.cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11442,6 +13706,35 @@ define void @atomic128_sub_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection408: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection409: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB210_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection410: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection411: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection412: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection413: +; HASWELL-O3-NEXT: jne .LBB210_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11574,6 +13867,34 @@ define void @atomic128_and_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection414: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection415: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB211_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection416: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection417: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection418: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection419: +; HASWELL-O3-NEXT: jne .LBB211_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11699,6 +14020,33 @@ define void @atomic128_or_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection420: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection421: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB212_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection422: +; HASWELL-O3-NEXT: orq $42, %rbx 
+; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection423: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection424: +; HASWELL-O3-NEXT: jne .LBB212_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11824,6 +14172,33 @@ define void @atomic128_xor_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection425: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection426: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB213_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection427: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection428: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection429: +; HASWELL-O3-NEXT: jne .LBB213_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11964,6 +14339,36 @@ define void @atomic128_nand_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection430: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection431: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection432: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB214_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection433: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection434: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection435: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection436: +; HASWELL-O3-NEXT: jne .LBB214_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -12091,6 +14496,33 @@ define void @atomic128_xchg_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection437: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; 
HASWELL-O3-NEXT: .Lpcsection438: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection439: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB215_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection440: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection441: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection442: +; HASWELL-O3-NEXT: jne .LBB215_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12224,6 +14656,35 @@ define void @atomic128_add_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection443: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection444: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB216_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection445: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection446: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection447: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection448: +; HASWELL-O3-NEXT: jne .LBB216_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12357,6 +14818,35 @@ define void @atomic128_sub_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection449: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection450: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB217_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection451: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection452: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection453: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection454: +; HASWELL-O3-NEXT: jne .LBB217_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12489,6 +14979,34 @@ define void @atomic128_and_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: 
.cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection455: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection456: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB218_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection457: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection458: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection459: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection460: +; HASWELL-O3-NEXT: jne .LBB218_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12614,6 +15132,33 @@ define void @atomic128_or_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection461: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection462: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB219_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection463: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection464: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection465: +; HASWELL-O3-NEXT: jne .LBB219_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12739,6 +15284,33 @@ define void @atomic128_xor_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection466: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection467: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB220_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection468: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection469: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection470: +; HASWELL-O3-NEXT: jne .LBB220_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; 
HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12879,6 +15451,36 @@ define void @atomic128_nand_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection471: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection472: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection473: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB221_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection474: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection475: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection476: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection477: +; HASWELL-O3-NEXT: jne .LBB221_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -13006,6 +15608,33 @@ define void @atomic128_xchg_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_release: +; HASWELL-O3: # %bb.0: +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection478: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection479: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection480: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB222_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection481: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection482: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection483: +; HASWELL-O3-NEXT: jne .LBB222_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 release, align 16, !pcsections !0 store volatile i64 1, ptr @foo, align 8 @@ -13138,6 +15767,35 @@ define void @atomic128_add_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection484: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection485: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB223_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection486: +; HASWELL-O3-NEXT: addq $42, %rbx +; 
HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection487: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection488: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection489: +; HASWELL-O3-NEXT: jne .LBB223_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13271,6 +15929,35 @@ define void @atomic128_sub_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection490: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection491: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB224_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection492: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection493: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection494: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection495: +; HASWELL-O3-NEXT: jne .LBB224_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13403,6 +16090,34 @@ define void @atomic128_and_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection496: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection497: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB225_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection498: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection499: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection500: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection501: +; HASWELL-O3-NEXT: jne .LBB225_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13528,6 +16243,33 @@ define void @atomic128_or_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: 
.Lpcsection502: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection503: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB226_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection504: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection505: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection506: +; HASWELL-O3-NEXT: jne .LBB226_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13653,6 +16395,33 @@ define void @atomic128_xor_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection507: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection508: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB227_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection509: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection510: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection511: +; HASWELL-O3-NEXT: jne .LBB227_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13793,6 +16562,36 @@ define void @atomic128_nand_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection512: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection513: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection514: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB228_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection515: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection516: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection517: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection518: +; HASWELL-O3-NEXT: jne .LBB228_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13920,6 +16719,33 @@ define void @atomic128_xchg_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; 
O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection519: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection520: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection521: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB229_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection522: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection523: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection524: +; HASWELL-O3-NEXT: jne .LBB229_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14053,6 +16879,35 @@ define void @atomic128_add_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection525: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection526: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB230_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection527: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection528: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection529: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection530: +; HASWELL-O3-NEXT: jne .LBB230_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14186,6 +17041,35 @@ define void @atomic128_sub_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection531: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection532: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB231_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection533: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection534: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection535: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection536: +; HASWELL-O3-NEXT: jne .LBB231_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; 
HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14318,6 +17202,34 @@ define void @atomic128_and_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection537: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection538: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB232_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection539: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection540: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection541: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection542: +; HASWELL-O3-NEXT: jne .LBB232_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14443,6 +17355,33 @@ define void @atomic128_or_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection543: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection544: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB233_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection545: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection546: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection547: +; HASWELL-O3-NEXT: jne .LBB233_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14568,6 +17507,33 @@ define void @atomic128_xor_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection548: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection549: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB234_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection550: +; HASWELL-O3-NEXT: xorq $42, %rbx +; 
HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection551: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection552: +; HASWELL-O3-NEXT: jne .LBB234_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14708,6 +17674,36 @@ define void @atomic128_nand_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection553: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection554: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection555: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB235_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection556: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection557: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection558: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection559: +; HASWELL-O3-NEXT: jne .LBB235_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14835,6 +17831,33 @@ define void @atomic128_xchg_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection560: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection561: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection562: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB236_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection563: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection564: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection565: +; HASWELL-O3-NEXT: jne .LBB236_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -14968,6 +17991,35 @@ define void @atomic128_add_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection566: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: 
.Lpcsection567: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB237_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection568: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection569: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection570: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection571: +; HASWELL-O3-NEXT: jne .LBB237_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15101,6 +18153,35 @@ define void @atomic128_sub_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection572: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection573: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB238_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection574: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection575: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection576: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection577: +; HASWELL-O3-NEXT: jne .LBB238_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15233,6 +18314,34 @@ define void @atomic128_and_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection578: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection579: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB239_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection580: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection581: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection582: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection583: +; HASWELL-O3-NEXT: jne .LBB239_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15358,6 +18467,33 @@ define void @atomic128_or_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; 
O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection584: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection585: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB240_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection586: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection587: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection588: +; HASWELL-O3-NEXT: jne .LBB240_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15483,6 +18619,33 @@ define void @atomic128_xor_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection589: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection590: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB241_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection591: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection592: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection593: +; HASWELL-O3-NEXT: jne .LBB241_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15623,6 +18786,36 @@ define void @atomic128_nand_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection594: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection595: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection596: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB242_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection597: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection598: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection599: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection600: +; HASWELL-O3-NEXT: jne .LBB242_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; 
HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15781,6 +18974,43 @@ define void @atomic128_cas_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection601: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection602: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection603: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection604: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection605: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection606: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection607: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection608: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection609: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection610: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection611: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection612: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection613: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 monotonic monotonic, align 16, !pcsections !0 @@ -15941,6 +19171,43 @@ define void @atomic128_cas_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection614: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection615: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection616: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection617: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection618: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection619: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection620: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection621: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection622: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection623: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection624: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection625: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection626: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 acquire monotonic, align 16, !pcsections !0 @@ -16101,6 +19368,43 @@ define void @atomic128_cas_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; 
HASWELL-O3-LABEL: atomic128_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection627: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection628: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection629: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection630: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection631: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection632: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection633: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection634: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection635: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection636: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection637: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection638: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection639: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 release monotonic, align 16, !pcsections !0 @@ -16261,6 +19565,43 @@ define void @atomic128_cas_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection640: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection641: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection642: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection643: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection644: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection645: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection646: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection647: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection648: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection649: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection650: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection651: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection652: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 acq_rel monotonic, align 16, !pcsections !0 @@ -16421,6 +19762,43 @@ define void @atomic128_cas_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection653: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection654: +; HASWELL-O3-NEXT: movl $1, %ebx +; 
HASWELL-O3-NEXT: .Lpcsection655: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection656: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection657: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection658: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection659: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection660: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection661: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection662: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection663: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection664: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection665: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $3, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 seq_cst monotonic, align 16, !pcsections !0 diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll index 35c7c0e09f394..3004b8b72fcc5 100644 --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -340,84 +340,87 @@ define i64 @cnt64(i64 %x) nounwind readnone { define i128 @cnt128(i128 %x) nounwind readnone { ; X86-NOSSE-LABEL: cnt128: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %edi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: andl $-16, %esp +; X86-NOSSE-NEXT: movl 24(%ebp), %eax +; X86-NOSSE-NEXT: movl 32(%ebp), %ecx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %esi ; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %esi -; X86-NOSSE-NEXT: addl %edi, %esi -; 
X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %edx, %edi -; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %ecx ; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx ; X86-NOSSE-NEXT: movl %ecx, %edi ; X86-NOSSE-NEXT: shrl $4, %edi ; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: movl 28(%ebp), %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx ; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx ; X86-NOSSE-NEXT: addl %edx, %ecx -; X86-NOSSE-NEXT: addl %esi, %ecx -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl $4, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %esi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: addl %ecx, %edx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: movl %edx, (%eax) ; X86-NOSSE-NEXT: movl $0, 12(%eax) ; X86-NOSSE-NEXT: movl $0, 8(%eax) ; X86-NOSSE-NEXT: movl $0, 4(%eax) +; 
X86-NOSSE-NEXT: leal -8(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; ; X64-BASE-LABEL: cnt128: @@ -462,20 +465,26 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; ; X86-POPCNT-LABEL: cnt128: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: movl %esi, (%eax) ; X86-POPCNT-NEXT: movl $0, 12(%eax) ; X86-POPCNT-NEXT: movl $0, 8(%eax) ; X86-POPCNT-NEXT: movl $0, 4(%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128: @@ -522,7 +531,11 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; ; X86-SSE2-LABEL: cnt128: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: psrlw $1, %xmm0 @@ -564,11 +577,17 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-SSE2-NEXT: movl $0, 12(%eax) ; X86-SSE2-NEXT: movl $0, 8(%eax) ; X86-SSE2-NEXT: movl $0, 4(%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -600,6 +619,8 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-SSSE3-NEXT: movl $0, 12(%eax) ; X86-SSSE3-NEXT: movl $0, 8(%eax) ; X86-SSSE3-NEXT: movl $0, 4(%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt @@ -928,87 +949,92 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-NOSSE-LABEL: cnt128_optsize: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edi, %ecx -; X86-NOSSE-NEXT: subl %ecx, %ebx +; X86-NOSSE-NEXT: 
andl $-16, %esp +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl 32(%ebp), %edx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %ecx # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: subl %eax, %esi ; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ecx, %ebp +; X86-NOSSE-NEXT: movl %esi, %edi +; X86-NOSSE-NEXT: andl %ecx, %edi +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %edi, %esi +; X86-NOSSE-NEXT: movl %esi, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %esi, %edi +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: movl $1431655765, %eax # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %eax, %esi +; X86-NOSSE-NEXT: subl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: shrl $2, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %edx, %ebx +; X86-NOSSE-NEXT: movl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %edx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: andl %edx, %ebx +; X86-NOSSE-NEXT: imull $16843009, %ebx, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %edi, %edx +; X86-NOSSE-NEXT: movl 28(%ebp), %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl %eax, %edi +; X86-NOSSE-NEXT: subl %edi, %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: andl %ecx, %edi ; X86-NOSSE-NEXT: shrl $2, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: addl %ebp, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %ebx, %ebp +; X86-NOSSE-NEXT: addl %edi, %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ebx, %edi +; X86-NOSSE-NEXT: movl 24(%ebp), %eax ; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %ebx ; X86-NOSSE-NEXT: subl %ebx, %eax ; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx ; X86-NOSSE-NEXT: shrl $2, %eax ; X86-NOSSE-NEXT: andl %ecx, %eax ; X86-NOSSE-NEXT: addl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: movl %eax, %ecx +; X86-NOSSE-NEXT: shrl $4, %ecx +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %eax, %edi +; X86-NOSSE-NEXT: andl %eax, %ecx +; X86-NOSSE-NEXT: imull $16843009, %edi, %eax # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: andl %ebx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %ebp, %eax -; X86-NOSSE-NEXT: subl %eax, %esi -; X86-NOSSE-NEXT: 
movl %esi, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %esi, %ebp -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %eax -; X86-NOSSE-NEXT: subl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: addl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl $4, %eax -; X86-NOSSE-NEXT: addl %edx, %eax -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ebx, %eax -; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: addl %ecx, %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: xorl %ecx, %ecx -; X86-NOSSE-NEXT: movl %ecx, 12(%eax) -; X86-NOSSE-NEXT: movl %ecx, 8(%eax) -; X86-NOSSE-NEXT: movl %ecx, 4(%eax) -; X86-NOSSE-NEXT: movl %edx, (%eax) +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: movl %edx, 12(%eax) +; X86-NOSSE-NEXT: movl %edx, 8(%eax) +; X86-NOSSE-NEXT: movl %edx, 4(%eax) +; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: leal -12(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx @@ -1057,13 +1083,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; ; X86-POPCNT-LABEL: cnt128_optsize: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: xorl %ecx, %ecx @@ -1071,7 +1101,9 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-POPCNT-NEXT: movl %ecx, 8(%eax) ; X86-POPCNT-NEXT: movl %ecx, 4(%eax) ; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_optsize: @@ -1118,7 +1150,11 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; ; X86-SSE2-LABEL: cnt128_optsize: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: psrlw $1, %xmm0 @@ 
-1161,11 +1197,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128_optsize: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -1198,6 +1240,8 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-SSSE3-NEXT: movl %ecx, 8(%eax) ; X86-SSSE3-NEXT: movl %ecx, 4(%eax) ; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt @@ -1415,85 +1459,88 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-NOSSE-LABEL: cnt128_pgso: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %edi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: andl $-16, %esp +; X86-NOSSE-NEXT: movl 24(%ebp), %eax +; X86-NOSSE-NEXT: movl 32(%ebp), %ecx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %esi ; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %esi -; X86-NOSSE-NEXT: addl %edi, %esi -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl 
%edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %edx, %edi -; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %ecx ; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx ; X86-NOSSE-NEXT: movl %ecx, %edi ; X86-NOSSE-NEXT: shrl $4, %edi ; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: movl 28(%ebp), %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx ; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx ; X86-NOSSE-NEXT: addl %edx, %ecx -; X86-NOSSE-NEXT: addl %esi, %ecx -; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: movl %edx, 12(%eax) -; X86-NOSSE-NEXT: movl %edx, 8(%eax) -; X86-NOSSE-NEXT: movl %edx, 4(%eax) -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl $4, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %esi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: addl %ecx, %edx +; X86-NOSSE-NEXT: xorl %ecx, %ecx +; X86-NOSSE-NEXT: movl %ecx, 12(%eax) +; X86-NOSSE-NEXT: movl %ecx, 8(%eax) +; X86-NOSSE-NEXT: movl %ecx, 
4(%eax) +; X86-NOSSE-NEXT: movl %edx, (%eax) +; X86-NOSSE-NEXT: leal -8(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; ; X64-BASE-LABEL: cnt128_pgso: @@ -1538,13 +1585,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; ; X86-POPCNT-LABEL: cnt128_pgso: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: xorl %ecx, %ecx @@ -1552,7 +1603,9 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-POPCNT-NEXT: movl %ecx, 8(%eax) ; X86-POPCNT-NEXT: movl %ecx, 4(%eax) ; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_pgso: @@ -1599,7 +1652,11 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; ; X86-SSE2-LABEL: cnt128_pgso: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: psrlw $1, %xmm0 @@ -1642,11 +1699,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128_pgso: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -1679,6 +1742,8 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-SSSE3-NEXT: movl %ecx, 8(%eax) ; X86-SSSE3-NEXT: movl %ecx, 4(%eax) ; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt diff --git a/llvm/test/CodeGen/X86/pr46004.ll b/llvm/test/CodeGen/X86/pr46004.ll index f7c7da089c365..829d6dfceba3d 100644 --- a/llvm/test/CodeGen/X86/pr46004.ll +++ b/llvm/test/CodeGen/X86/pr46004.ll @@ -6,7 +6,17 @@ define void @fuzz22357(i128 %a0) { ; X86-LABEL: fuzz22357: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; 
X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movb $0, (%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: fuzz22357: @@ -24,6 +34,15 @@ define void @fuzz22357(i128 %a0) { define void @fuzz22723(i128 %a0) { ; X86-LABEL: fuzz22723: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: fuzz22723: diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll index 50a967e1c2a1a..ce9723b3a84bc 100644 --- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll @@ -762,11 +762,15 @@ define i32 @x_to_s32(x86_fp80 %a) nounwind { define i32 @t_to_u32(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_u32: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixunstfsi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_u32: @@ -797,12 +801,18 @@ define i32 @t_to_u32(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_u32: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixunstfsi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_u32: @@ -835,12 +845,18 @@ define i32 @t_to_u32(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_u32: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixunstfsi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_u32: @@ -860,11 +876,15 @@ define i32 @t_to_u32(fp128 %a) nounwind { define i32 @t_to_s32(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_s32: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: 
vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixtfsi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_s32: @@ -895,12 +915,18 @@ define i32 @t_to_s32(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_s32: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixtfsi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_s32: @@ -933,12 +959,18 @@ define i32 @t_to_s32(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_s32: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixtfsi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_s32: diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll index f516db8b30ffe..3287869f2c601 100644 --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -1417,11 +1417,15 @@ define i64 @x_to_s64(x86_fp80 %a) nounwind { define i64 @t_to_u64(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_u64: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixunstfdi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_u64: @@ -1452,12 +1456,18 @@ define i64 @t_to_u64(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_u64: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixunstfdi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_u64: @@ -1490,12 +1500,18 @@ define i64 
@t_to_u64(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_u64: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixunstfdi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_u64: @@ -1515,11 +1531,15 @@ define i64 @t_to_u64(fp128 %a) nounwind { define i64 @t_to_s64(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_s64: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixtfdi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_s64: @@ -1550,12 +1570,18 @@ define i64 @t_to_s64(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_s64: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixtfdi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_s64: @@ -1588,12 +1614,18 @@ define i64 @t_to_s64(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_s64: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixtfdi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_s64: diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 874913629e9e3..8a287229a1cb1 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -118,30 +118,33 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; X86-LABEL: scmp.8.128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; 
X86-NEXT: movl %ebx, %ebp -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: cmpl %ecx, 8(%ebp) +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %ebx +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: setl %cl -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: cmpl 8(%ebp), %esi +; X86-NEXT: sbbl 12(%ebp), %eax +; X86-NEXT: sbbl 16(%ebp), %edi +; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: setl %al ; X86-NEXT: subb %cl, %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index 4925f8bc6c8b0..392bc83d9d5d8 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -307,69 +307,70 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp +; X86-NEXT: subl $112, %esp ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %eax ; X86-NEXT: movl 20(%ebp), %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl $31, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: shldl $31, %edi, %esi +; X86-NEXT: shldl $31, %ecx, %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: shll $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edx -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %eax +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %ebx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: sets %al -; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %cl -; X86-NEXT: xorb %al, %cl -; 
X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 20(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %esi +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %al +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %bl +; X86-NEXT: xorb %al, %bl ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al -; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: testb %bl, %al +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %edx ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index e7727a0ab6178..7df490f984928 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -370,67 +370,68 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 20(%ebp), %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %eax, %edi -; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: subl $128, %esp +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sarl $31, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi 
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %ecx
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: shldl $31, %edi, %ebx
+; X86-NEXT: shldl $31, %esi, %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: shll $31, %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl $1, %esi
-; X86-NEXT: sbbl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $1, %edi
 ; X86-NEXT: sbbl $0, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: sbbl $0, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: sets %al
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: sbbl $0, %ebx
 ; X86-NEXT: testl %ecx, %ecx
-; X86-NEXT: sets %dl
-; X86-NEXT: xorb %al, %dl
-; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: pushl %eax
+; X86-NEXT: sets %al
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: sets %cl
+; X86-NEXT: xorb %al, %cl
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT: calll __modti3
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: subl $4, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -438,41 +439,38 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT: orl %eax, %ecx
 ; X86-NEXT: setne %al
 ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: cmovel %esi, %ebx
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: cmpl $-1, %esi
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmpl $-1, %edi
 ; X86-NEXT: sbbl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ecx
 ; X86-NEXT: sbbl $0, %ecx
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %ebx, %ecx
 ; X86-NEXT: sbbl $0, %ecx
 ; X86-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
-; X86-NEXT: cmovll %eax, %edx
+; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT: movl $0, %ecx
-; X86-NEXT: cmovgel %ecx, %edi
-; X86-NEXT: movl %edi, %eax
 ; X86-NEXT: cmovgel %ecx, %ebx
+; X86-NEXT: cmovgel %ecx, %eax
 ; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: cmovgel %ecx, %esi
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: negl %edi
-; X86-NEXT: movl $-2147483648, %edi # imm = 0x80000000
-; X86-NEXT: sbbl %edx, %edi
-; X86-NEXT: movl $-1, %edi
-; X86-NEXT: sbbl %ebx, %edi
-; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: cmovgel %ecx, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: movl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: movl $-1, %esi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %ebx, %ecx
 ; X86-NEXT: movl $0, %eax
-; X86-NEXT: cmovgel %eax, %esi
+; X86-NEXT: cmovgel %eax, %edi
 ; X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000
 ; X86-NEXT: cmovgel %eax, %edx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
 ; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
@@ -805,137 +803,155 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $208, %esp
-; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: subl $240, %esp
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: movl 20(%ebp), %edi
 ; X86-NEXT: movl 16(%ebp), %ebx
-; X86-NEXT: movl 32(%ebp), %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
 ; X86-NEXT: leal (%ebx,%ebx), %eax
 ; X86-NEXT: shrl $31, %ebx
 ; X86-NEXT: shldl $31, %eax, %ebx
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __modti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl 36(%ebp), %ecx
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: sarl $31, %eax
-; X86-NEXT: leal (%ecx,%ecx), %edx
-; X86-NEXT: shrl $31, %ecx
-; X86-NEXT: shldl $31, %edx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl 36(%ebp)
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %edx
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: leal (%edi,%edi), %eax
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl 32(%ebp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl 36(%ebp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __modti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl 28(%ebp), %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, %edi
 ; X86-NEXT: sarl $31, %edi
-; X86-NEXT: leal (%ecx,%ecx), %eax
-; X86-NEXT: shrl $31, %ecx
-; X86-NEXT: shldl $31, %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: leal (%esi,%esi), %eax
+; X86-NEXT: shrl $31, %esi
+; X86-NEXT: shldl $31, %eax, %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __modti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl 40(%ebp), %esi
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT: movl 24(%ebp), %ecx
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: sarl $31, %eax
-; X86-NEXT: leal (%ecx,%ecx), %edx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: leal (%ecx,%ecx), %eax
 ; X86-NEXT: shrl $31, %ecx
-; X86-NEXT: shldl $31, %edx, %ecx
+; X86-NEXT: shldl $31, %eax, %ecx
 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl 40(%ebp)
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %edx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl 28(%ebp)
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: subl $1, %eax
@@ -949,18 +965,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: sbbl $0, %edx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT: testl %ebx, %ebx
 ; X86-NEXT: sets %bl
-; X86-NEXT: testl %edi, %edi
-; X86-NEXT: sets %bh
-; X86-NEXT: xorb %bl, %bh
+; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: orl %edi, %esi
-; X86-NEXT: setne %bl
-; X86-NEXT: testb %bh, %bl
+; X86-NEXT: setne %bh
+; X86-NEXT: testb %bl, %bh
 ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -1107,36 +1123,24 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: subl $1, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: sbbl $0, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl $0, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT: sets %al
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: testl %edx, %edx
-; X86-NEXT: sets %ah
-; X86-NEXT: xorb %al, %ah
-; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl 40(%ebp)
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %eax
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: sets %cl
+; X86-NEXT: xorb %al, %cl
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT: calll __modti3
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: subl $4, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -1144,38 +1148,38 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT: orl %eax, %ecx
 ; X86-NEXT: setne %al
 ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT: cmpl $-1, %ebx
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: sbbl $0, %eax
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: sbbl $0, %eax
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
 ; X86-NEXT: sbbl $0, %eax
 ; X86-NEXT: movl $0, %eax
-; X86-NEXT: cmovgel %eax, %esi
-; X86-NEXT: cmovgel %eax, %ecx
 ; X86-NEXT: cmovgel %eax, %edi
+; X86-NEXT: cmovgel %eax, %ecx
+; X86-NEXT: cmovgel %eax, %esi
 ; X86-NEXT: movl $-1, %edx
 ; X86-NEXT: cmovgel %edx, %ebx
 ; X86-NEXT: movl %ebx, %eax
 ; X86-NEXT: negl %eax
 ; X86-NEXT: movl $-1, %eax
-; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: sbbl %esi, %eax
 ; X86-NEXT: movl $-1, %eax
 ; X86-NEXT: sbbl %ecx, %eax
 ; X86-NEXT: movl $-1, %eax
-; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: sbbl %edi, %eax
 ; X86-NEXT: movl $0, %eax
 ; X86-NEXT: cmovgel %eax, %ebx
-; X86-NEXT: cmovgel %edx, %edi
-; X86-NEXT: shldl $31, %ebx, %edi
+; X86-NEXT: cmovgel %edx, %esi
+; X86-NEXT: shldl $31, %ebx, %esi
 ; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT: movl %ecx, 8(%eax)
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index 76cb4e87bae18..dfeef48897e06 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -792,14 +792,24 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
 define void @combineShiftOfShiftedLogic(i128 %a1, i32 %a2, ptr %p) {
 ; X86-LABEL: combineShiftOfShiftedLogic:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 28(%ebp), %ecx
 ; X86-NEXT: movl %eax, 20(%ecx)
 ; X86-NEXT: movl $0, 16(%ecx)
 ; X86-NEXT: movl $0, 12(%ecx)
 ; X86-NEXT: movl $0, 8(%ecx)
 ; X86-NEXT: movl $0, 4(%ecx)
 ; X86-NEXT: movl $0, (%ecx)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: combineShiftOfShiftedLogic:
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 767bd772ab7a3..9323cd5b1917f 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -212,9 +212,18 @@ entry:
 }
 
 define void @test_lshr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_lshr_i128_outofrange:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: ret{{[l|q]}}
+; i686-LABEL: test_lshr_i128_outofrange:
+; i686: # %bb.0: # %entry
+; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: movl %ebp, %esp
+; i686-NEXT: popl %ebp
+; i686-NEXT: retl
+;
+; x86_64-LABEL: test_lshr_i128_outofrange:
+; x86_64: # %bb.0: # %entry
+; x86_64-NEXT: retq
 entry:
 %0 = lshr i128 %x, -1
 store i128 %0, ptr %r, align 16
@@ -222,9 +231,18 @@ entry:
 }
 
 define void @test_ashr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_ashr_i128_outofrange:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: ret{{[l|q]}}
+; i686-LABEL: test_ashr_i128_outofrange:
+; i686: # %bb.0: # %entry
+; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: movl %ebp, %esp
+; i686-NEXT: popl %ebp
+; i686-NEXT: retl
+;
+; x86_64-LABEL: test_ashr_i128_outofrange:
+; x86_64: # %bb.0: # %entry
+; x86_64-NEXT: retq
 entry:
 %0 = ashr i128 %x, -1
 store i128 %0, ptr %r, align 16
@@ -232,9 +250,18 @@ entry:
 }
 
 define void @test_shl_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_shl_i128_outofrange:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: ret{{[l|q]}}
+; i686-LABEL: test_shl_i128_outofrange:
+; i686: # %bb.0: # %entry
+; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: movl %ebp, %esp
+; i686-NEXT: popl %ebp
+; i686-NEXT: retl
+;
+; x86_64-LABEL: test_shl_i128_outofrange:
+; x86_64: # %bb.0: # %entry
+; x86_64-NEXT: retq
 entry:
 %0 = shl i128 %x, -1
 store i128 %0, ptr %r, align 16
@@ -874,26 +901,31 @@ define <2 x i256> @shl_zext_lshr_outofrange(<2 x i128> %a0) {
 define i128 @lshr_shl_mask(i128 %a0) {
 ; i686-LABEL: lshr_shl_mask:
 ; i686: # %bb.0:
-; i686-NEXT: pushl %edi
+; i686-NEXT: pushl %ebp
 ; i686-NEXT: .cfi_def_cfa_offset 8
+; i686-NEXT: .cfi_offset %ebp, -8
+; i686-NEXT: movl %esp, %ebp
+; i686-NEXT: .cfi_def_cfa_register %ebp
+; i686-NEXT: pushl %edi
 ; i686-NEXT: pushl %esi
-; i686-NEXT: .cfi_def_cfa_offset 12
-; i686-NEXT: .cfi_offset %esi, -12
-; i686-NEXT: .cfi_offset %edi, -8
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: .cfi_offset %esi, -16
+; i686-NEXT: .cfi_offset %edi, -12
+; i686-NEXT: movl 8(%ebp), %eax
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 28(%ebp), %edx
+; i686-NEXT: movl 32(%ebp), %esi
 ; i686-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF
-; i686-NEXT: andl {{[0-9]+}}(%esp), %edi
+; i686-NEXT: andl 36(%ebp), %edi
 ; i686-NEXT: movl %edi, 12(%eax)
 ; i686-NEXT: movl %esi, 8(%eax)
 ; i686-NEXT: movl %edx, 4(%eax)
 ; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: leal -8(%ebp), %esp
 ; i686-NEXT: popl %esi
-; i686-NEXT: .cfi_def_cfa_offset 8
 ; i686-NEXT: popl %edi
-; i686-NEXT: .cfi_def_cfa_offset 4
+; i686-NEXT: popl %ebp
+; i686-NEXT: .cfi_def_cfa %esp, 4
 ; i686-NEXT: retl $4
 ;
 ; x86_64-LABEL: lshr_shl_mask:
diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll
index 86891e964d96d..509d4443e930a 100644
--- a/llvm/test/CodeGen/X86/smax.ll
+++ b/llvm/test/CodeGen/X86/smax.ll
@@ -151,31 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmpl %ebx, %edx
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: cmpl 24(%ebp), %ebx
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: sbbl 28(%ebp), %esi
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: sbbl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: movl 52(%ebp), %ecx
 ; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sbbl %ebp, %eax
-; X86-NEXT: cmovll %ebx, %edx
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmovll %ebp, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: cmovll 24(%ebp), %ebx
+; X86-NEXT: cmovll 28(%ebp), %edi
+; X86-NEXT: cmovll 32(%ebp), %edx
+; X86-NEXT: cmovll %esi, %ecx
+; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -717,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: shrdl $28, %edi, %ecx
-; X86-NEXT: sarl $28, %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: movl 52(%ebp), %edx
+; X86-NEXT: shrdl $28, %edx, %ecx
+; X86-NEXT: sarl $28, %edx
 ; X86-NEXT: cmpl %esi, %ecx
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sbbl %eax, %edi
 ; X86-NEXT: cmovll %esi, %ecx
-; X86-NEXT: cmovll %edx, %edi
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: cmovll %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 4(%eax)
 ; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 %ax = ashr i128 %a, 64
 %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll
index 8907f6c4cd598..5e9fe27b41d2c 100644
--- a/llvm/test/CodeGen/X86/smin.ll
+++ b/llvm/test/CodeGen/X86/smin.ll
@@ -151,32 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmpl %edx, %ebx
-; X86-NEXT: sbbl %esi, %ebp
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: cmpl %ecx, 24(%ebp)
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %edi
 ; X86-NEXT: movl %edi, %eax
-; X86-NEXT: sbbl %ebp, %eax
-; X86-NEXT: cmovll %ebx, %edx
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovll %edi, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: cmovll 24(%ebp), %ecx
+; X86-NEXT: cmovll 28(%ebp), %edx
+; X86-NEXT: cmovll 32(%ebp), %esi
+; X86-NEXT: cmovll %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -718,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: shrdl $28, %edi, %ecx
-; X86-NEXT: sarl $28, %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: movl 52(%ebp), %edx
+; X86-NEXT: shrdl $28, %edx, %ecx
+; X86-NEXT: sarl $28, %edx
 ; X86-NEXT: cmpl %ecx, %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: sbbl %edx, %edi
 ; X86-NEXT: cmovll %esi, %ecx
-; X86-NEXT: cmovll %edx, %edi
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: cmovll %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 4(%eax)
 ; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 %ax = ashr i128 %a, 64
 %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/stack-align2.ll b/llvm/test/CodeGen/X86/stack-align2.ll
index 99f36d2ca8b7a..095a9090ed08f 100644
--- a/llvm/test/CodeGen/X86/stack-align2.ll
+++ b/llvm/test/CodeGen/X86/stack-align2.ll
@@ -3,13 +3,11 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=i386-netbsd | FileCheck %s -check-prefix=NETBSD-I386
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-apple-darwin8 | FileCheck %s -check-prefix=DARWIN-I386
 ; RUN: llc < %s -mcpu=generic -mtriple=i386-pc-solaris2.11 | FileCheck %s -check-prefix=SOLARIS-I386
-; RUN: llc < %s -mcpu=generic -mtriple=i386-nacl | FileCheck %s -check-prefix=NACL-I386
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s -check-prefix=LINUX-X86_64
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-kfreebsd | FileCheck %s -check-prefix=KFREEBSD-X86_64
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-netbsd | FileCheck %s -check-prefix=NETBSD-X86_64
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin8 | FileCheck %s -check-prefix=DARWIN-X86_64
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-solaris2.11 | FileCheck %s -check-prefix=SOLARIS-X86_64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-nacl | FileCheck %s -check-prefix=NACL-X86_64
 
 define i32 @test() nounwind {
 entry:
@@ -19,7 +17,6 @@ entry:
 ; LINUX-I386: subl $12, %esp
 ; KFREEBSD-I386: subl $12, %esp
 ; DARWIN-I386: subl $12, %esp
-; NACL-I386: subl $12, %esp
 ; NETBSD-I386-NOT: subl {{.*}}, %esp
 ; SOLARIS-I386-NOT: subl {{.*}}, %esp
 
@@ -27,8 +24,6 @@ entry:
 ; LINUX-X86_64-NOT: subq {{.*}}, %rsp
 ; DARWIN-X86_64: pushq %{{.*}}
 ; DARWIN-X86_64-NOT: subq {{.*}}, %rsp
-; NACL-X86_64: pushq %{{.*}}
-; NACL-X86_64-NOT: subq {{.*}}, %rsp
 ; NETBSD-X86_64: pushq %{{.*}}
 ; NETBSD-X86_64-NOT: subq {{.*}}, %rsp
 ; SOLARIS-X86_64: pushq %{{.*}}
diff --git a/llvm/test/CodeGen/X86/stack-protector-target-openbsd.ll b/llvm/test/CodeGen/X86/stack-protector-target-openbsd.ll
new file mode 100644
index 0000000000000..06382c6bbbbed
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-protector-target-openbsd.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386--linux < %s | FileCheck -check-prefix=LINUX32 %s
+; RUN: llc -mtriple=x86_64--linux < %s | FileCheck -check-prefix=LINUX64 %s
+; RUN: llc -mtriple=i386--openbsd < %s | FileCheck -check-prefix=OPENBSD32 %s
+; RUN: llc -mtriple=x86_64--openbsd < %s | FileCheck -check-prefix=OPENBSD64 %s
+
+define void @func() sspreq nounwind {
+; LINUX32-LABEL: func:
+; LINUX32: # %bb.0:
+; LINUX32-NEXT: subl $12, %esp
+; LINUX32-NEXT: movl %gs:20, %eax
+; LINUX32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; LINUX32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; LINUX32-NEXT: movl %eax, (%esp)
+; LINUX32-NEXT: calll capture@PLT
+; LINUX32-NEXT: movl %gs:20, %eax
+; LINUX32-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; LINUX32-NEXT: jne .LBB0_2
+; LINUX32-NEXT: # %bb.1: # %SP_return
+; LINUX32-NEXT: addl $12, %esp
+; LINUX32-NEXT: retl
+; LINUX32-NEXT: .LBB0_2: # %CallStackCheckFailBlk
+; LINUX32-NEXT: calll __stack_chk_fail@PLT
+;
+; LINUX64-LABEL: func:
+; LINUX64: # %bb.0:
+; LINUX64-NEXT: subq $24, %rsp
+; LINUX64-NEXT: movq %fs:40, %rax
+; LINUX64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; LINUX64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; LINUX64-NEXT: callq capture@PLT
+; LINUX64-NEXT: movq %fs:40, %rax
+; LINUX64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; LINUX64-NEXT: jne .LBB0_2
+; LINUX64-NEXT: # %bb.1: # %SP_return
+; LINUX64-NEXT: addq $24, %rsp
+; LINUX64-NEXT: retq
+; LINUX64-NEXT: .LBB0_2: # %CallStackCheckFailBlk
+; LINUX64-NEXT: callq __stack_chk_fail@PLT
+;
+; OPENBSD32-LABEL: func:
+; OPENBSD32: # %bb.0:
+; OPENBSD32-NEXT: subl $8, %esp
+; OPENBSD32-NEXT: movl __guard_local, %eax
+; OPENBSD32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; OPENBSD32-NEXT: movl %esp, %eax
+; OPENBSD32-NEXT: pushl %eax
+; OPENBSD32-NEXT: calll capture@PLT
+; OPENBSD32-NEXT: addl $4, %esp
+; OPENBSD32-NEXT: movl __guard_local, %eax
+; OPENBSD32-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; OPENBSD32-NEXT: jne .LBB0_2
+; OPENBSD32-NEXT: # %bb.1: # %SP_return
+; OPENBSD32-NEXT: addl $8, %esp
+; OPENBSD32-NEXT: retl
+; OPENBSD32-NEXT: .LBB0_2: # %CallStackCheckFailBlk
+; OPENBSD32-NEXT: pushl $.LSSH
+; OPENBSD32-NEXT: calll __stack_smash_handler@PLT
+;
+; OPENBSD64-LABEL: func:
+; OPENBSD64: # %bb.0:
+; OPENBSD64-NEXT: subq $24, %rsp
+; OPENBSD64-NEXT: movq __guard_local(%rip), %rax
+; OPENBSD64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; OPENBSD64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; OPENBSD64-NEXT: callq capture@PLT
+; OPENBSD64-NEXT: movq __guard_local(%rip), %rax
+; OPENBSD64-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
+; OPENBSD64-NEXT: jne .LBB0_2
+; OPENBSD64-NEXT: # %bb.1: # %SP_return
+; OPENBSD64-NEXT: addq $24, %rsp
+; OPENBSD64-NEXT: retq
+; OPENBSD64-NEXT: .LBB0_2: # %CallStackCheckFailBlk
+; OPENBSD64-NEXT: movl $.LSSH, %edi
+; OPENBSD64-NEXT: callq __stack_smash_handler@PLT
+ %alloca = alloca i32, align 4
+ call void @capture(ptr %alloca)
+ ret void
+}
+
+declare void @capture(ptr)
diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll
index 953a0d65c5386..ab28a3b4a2b63 100644
--- a/llvm/test/CodeGen/X86/test-shrink-bug.ll
+++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll
@@ -48,18 +48,17 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) {
 ; CHECK-X86: ## %bb.0:
 ; CHECK-X86-NEXT: subl $12, %esp
 ; CHECK-X86-NEXT: .cfi_def_cfa_offset 16
-; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; CHECK-X86-NEXT: cmpb $123, {{[0-9]+}}(%esp)
-; CHECK-X86-NEXT: setne %cl
-; CHECK-X86-NEXT: testl $263, %eax ## imm = 0x107
-; CHECK-X86-NEXT: setne %al
-; CHECK-X86-NEXT: testb %cl, %al
-; CHECK-X86-NEXT: jne LBB1_2
-; CHECK-X86-NEXT: ## %bb.1: ## %yes
-; CHECK-X86-NEXT: addl $12, %esp
-; CHECK-X86-NEXT: retl
-; CHECK-X86-NEXT: LBB1_2: ## %no
+; CHECK-X86-NEXT: sete %al
+; CHECK-X86-NEXT: testl $263, %ecx ## imm = 0x107
+; CHECK-X86-NEXT: je LBB1_3
+; CHECK-X86-NEXT: ## %bb.1:
+; CHECK-X86-NEXT: testb %al, %al
+; CHECK-X86-NEXT: jne LBB1_3
+; CHECK-X86-NEXT: ## %bb.2: ## %no
 ; CHECK-X86-NEXT: calll _bar
+; CHECK-X86-NEXT: LBB1_3: ## %yes
 ; CHECK-X86-NEXT: addl $12, %esp
 ; CHECK-X86-NEXT: retl
 ;
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 6a52acfe2fb30..7f17299b39e33 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -107,29 +107,33 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: ucmp.8.128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: cmpl %eax, 24(%ebp)
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: sbbl %edx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 16(%ebp), %ebx
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: sbbl %ecx, %eax
 ; X86-NEXT: setb %al
-; X86-NEXT: cmpl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: sbbl %ebp, %esi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl 8(%ebp), %edi
+; X86-NEXT: cmpl 24(%ebp), %edi
+; X86-NEXT: sbbl 28(%ebp), %edx
+; X86-NEXT: sbbl 32(%ebp), %ebx
+; X86-NEXT: sbbl %esi, %ecx
 ; X86-NEXT: sbbb $0, %al
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
index 5b1e0545502b8..82dfeeee13293 100644
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -153,26 +153,28 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
+; X86-NEXT: subl $80, %esp
 ; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT: movl %ecx, %edx
 ; X86-NEXT: shrl %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT: shldl $31, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT: shll $31, %eax
-; X86-NEXT: movl %esp, %esi
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %esi
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __udivti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: leal -4(%ebp), %esp
 ; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 30a7f80b2315d..3da5973f9f903 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -194,32 +194,34 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
+; X86-NEXT: subl $80, %esp
 ; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT: movl %ecx, %edx
 ; X86-NEXT: shrl %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT: shldl $31, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT: shll $31, %eax
-; X86-NEXT: movl %esp, %esi
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %esi
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT: calll __udivti3
-; X86-NEXT: addl $32, %esp
+; X86-NEXT: subl $4, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl $-1, %eax
 ; X86-NEXT: movl $-1, %edx
 ; X86-NEXT: jne .LBB4_2
 ; X86-NEXT: # %bb.1:
-; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: .LBB4_2:
 ; X86-NEXT: leal -4(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index f589d4a7b04a9..7ef859978cdbf 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -232,31 +232,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmpl %ebx, %edx
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: cmpl 24(%ebp), %ebx
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: sbbl 28(%ebp), %esi
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: sbbl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: movl 52(%ebp), %ecx
 ; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: sbbl %ebp, %eax
-; X86-NEXT: cmovbl %ebx, %edx
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: cmovbl %ebp, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: cmovbl 24(%ebp), %ebx
+; X86-NEXT: cmovbl 28(%ebp), %edi
+; X86-NEXT: cmovbl 32(%ebp), %edx
+; X86-NEXT: cmovbl %esi, %ecx
+; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -282,37 +285,40 @@ define i128 @test_i128_1(i128 %a) nounwind {
 ; X86-LABEL: test_i128_1:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %eax
 ; X86-NEXT: cmpl $1, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: testl %edx, %edx
-; X86-NEXT: movl $1, %edi
-; X86-NEXT: cmovnel %eax, %edi
-; X86-NEXT: cmovel %ebx, %edi
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: negl %ebp
-; X86-NEXT: movl $0, %ebp
-; X86-NEXT: sbbl %esi, %ebp
-; X86-NEXT: movl $1, %ebp
-; X86-NEXT: cmovbl %eax, %ebp
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: cmovbl %edx, %ebx
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: cmovel %edi, %ebp
-; X86-NEXT: cmovel %edx, %ebx
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: movl %ebp, (%eax)
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: cmpl $0, 28(%ebp)
+; X86-NEXT: movl $1, %esi
+; X86-NEXT: cmovnel %eax, %esi
+; X86-NEXT: cmovel %ecx, %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: negl %ecx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: movl $1, %ebx
+; X86-NEXT: cmovbl %eax, %ebx
+; X86-NEXT: cmovbl 28(%ebp), %edi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: cmovel %esi, %ebx
+; X86-NEXT: cmovel 28(%ebp), %edi
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -1312,29 +1318,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: shrdl $28, %edi, %ecx
-; X86-NEXT: sarl $28, %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: movl 52(%ebp), %edx
+; X86-NEXT: shrdl $28, %edx, %ecx
+; X86-NEXT: sarl $28, %edx
 ; X86-NEXT: cmpl %esi, %ecx
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sbbl %eax, %edi
 ; X86-NEXT: cmovbl %esi, %ecx
-; X86-NEXT: cmovbl %edx, %edi
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: cmovbl %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 4(%eax)
 ; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 %ax = ashr i128 %a, 64
 %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll
index 7a5cdbb9ce758..c927abf3a4263 100644
--- a/llvm/test/CodeGen/X86/umin.ll
+++ b/llvm/test/CodeGen/X86/umin.ll
@@ -147,32 +147,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmpl %edx, %ebx
-; X86-NEXT: sbbl %esi, %ebp
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: cmpl %ecx, 24(%ebp)
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 36(%ebp), %edi
 ; X86-NEXT: movl %edi, %eax
-; X86-NEXT: sbbl %ebp, %eax
-; X86-NEXT: cmovbl %ebx, %edx
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovbl %edi, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 12(%eax)
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: cmovbl 24(%ebp), %ecx
+; X86-NEXT: cmovbl 28(%ebp), %edx
+; X86-NEXT: cmovbl 32(%ebp), %esi
+; X86-NEXT: cmovbl %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: popl %ebx
@@ -727,29 +729,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: shrdl $28, %edi, %ecx
-; X86-NEXT: sarl $28, %edi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: movl 52(%ebp), %edx
+; X86-NEXT: shrdl $28, %edx, %ecx
+; X86-NEXT: sarl $28, %edx
 ; X86-NEXT: cmpl %ecx, %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: sbbl %edx, %edi
 ; X86-NEXT: cmovbl %esi, %ecx
-; X86-NEXT: cmovbl %edx, %edi
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: cmovbl %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 4(%eax)
 ; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: leal -8(%ebp), %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
 ; X86-NEXT: retl $4
 %ax = ashr i128 %a, 64
 %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 4c3170304b980..89afd1b00444b 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -38,8 +38,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT: .cfi_def_cfa_offset 16
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: .cfi_def_cfa_offset 44
+; X86-NEXT: subl $28, %esp
+; X86-NEXT: .cfi_def_cfa_offset 48
 ; X86-NEXT: .cfi_offset %esi, -20
 ; X86-NEXT: .cfi_offset %edi, -16
 ; X86-NEXT: .cfi_offset %ebx, -12
@@ -147,7 +147,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT: andb $1, %al
 ; X86-NEXT: movb %al, 16(%ecx)
 ; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: addl $24, %esp
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: .cfi_def_cfa_offset 20
 ; X86-NEXT: popl %esi
 ; X86-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/unreachable-mbb-undef-phi.mir b/llvm/test/CodeGen/X86/unreachable-mbb-undef-phi.mir
index 1bdbc53862667..232a5e3353b26 100644
--- a/llvm/test/CodeGen/X86/unreachable-mbb-undef-phi.mir
+++ b/llvm/test/CodeGen/X86/unreachable-mbb-undef-phi.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=x86_64-- %s -o - -run-pass=processimpdefs -run-pass=unreachable-mbb-elimination | FileCheck %s
+# RUN: llc -mtriple=x86_64-- %s -o - -passes=process-imp-defs,unreachable-mbb-elimination | FileCheck %s
 ---
 name: f
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/X86/wide-integer-cmp.ll b/llvm/test/CodeGen/X86/wide-integer-cmp.ll
index a15d633d85381..12dccca76eb19 100644
--- a/llvm/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/llvm/test/CodeGen/X86/wide-integer-cmp.ll
@@ -92,6 +92,8 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushl %esi
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: subl $8, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: .cfi_offset %esi, -8
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -101,15 +103,15 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: jge .LBB4_2
+; CHECK-NEXT: jge .LBB4_3
 ; CHECK-NEXT: # %bb.1: # %bb1
 ; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: .cfi_def_cfa_offset 4
-; CHECK-NEXT: retl
-; CHECK-NEXT: .LBB4_2: # %bb2
-; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: jmp .LBB4_2
+; CHECK-NEXT: .LBB4_3: # %bb2
 ; CHECK-NEXT: movl $2, %eax
+; CHECK-NEXT: .LBB4_2: # %bb1
+; CHECK-NEXT: addl $8, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: popl %esi
 ; CHECK-NEXT: .cfi_def_cfa_offset 4
 ; CHECK-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
new file mode 100644
index 0000000000000..5ac90a0af2e57
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck -check-prefix=CHECK32 %s
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck -check-prefix=CHECK64 %s
+
+define i64 @test_sdiv_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_sdiv_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: calll __alldiv
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_sdiv_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rdx, %r8
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: cqto
+; CHECK64-NEXT: idivq %r8
+; CHECK64-NEXT: retq
+ %ret = sdiv i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @test_srem_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_srem_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: calll __allrem
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_srem_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rdx, %r8
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: cqto
+; CHECK64-NEXT: idivq %r8
+; CHECK64-NEXT: movq %rdx, %rax
+; CHECK64-NEXT: retq
+ %ret = srem i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @test_udiv_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_udiv_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: calll __aulldiv
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_udiv_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rdx, %r8
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: xorl %edx, %edx
+; CHECK64-NEXT: divq %r8
+; CHECK64-NEXT: retq
+ %ret = udiv i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @test_urem_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_urem_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT: calll __aullrem
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_urem_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rdx, %r8
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: xorl %edx, %edx
+; CHECK64-NEXT: divq %r8
+; CHECK64-NEXT: movq %rdx, %rax
+; CHECK64-NEXT: retq
+ %ret = urem i64 %a, %b
+ ret i64 %ret
+}
+
+define i64 @test_mul_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_mul_i64:
+; CHECK32: # %bb.0:
+; CHECK32-NEXT: pushl %esi
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT: movl %ecx, %eax
+; CHECK32-NEXT: mull %esi
+; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT: addl %ecx, %edx
+; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT: addl %esi, %edx
+; CHECK32-NEXT: popl %esi
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_mul_i64:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movq %rcx, %rax
+; CHECK64-NEXT: imulq %rdx, %rax
+; CHECK64-NEXT: retq
+ %ret = mul i64 %a, %b
+ ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/X86/x86-64-baseptr.ll b/llvm/test/CodeGen/X86/x86-64-baseptr.ll
index 020004def6e7a..62c63d5defe60 100644
--- a/llvm/test/CodeGen/X86/x86-64-baseptr.ll
+++ b/llvm/test/CodeGen/X86/x86-64-baseptr.ll
@@ -2,14 +2,6 @@
 ; RUN: llc -mtriple=x86_64-pc-linux -stackrealign -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=x86_64-pc-linux-gnux32 -stackrealign -verify-machineinstrs < %s | FileCheck -check-prefix=X32ABI %s
-; This should run with NaCl as well ( -mtriple=x86_64-pc-nacl ) but currently doesn't due to PR22655
-
-; Make sure the correct register gets set up as the base pointer
-; This should be rbx for x64 and 64-bit NaCl and ebx for x32
-; NACL-LABEL: base
-; NACL: subq $32, %rsp
-; NACL: movq %rsp, %rbx
-
 declare i32 @helper() nounwind
 
 define void @base() #0 {
 ; CHECK-LABEL: base:
diff --git a/llvm/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll b/llvm/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
index bceebdc9ad7d2..26be80ea58949 100644
--- a/llvm/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
+++ b/llvm/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
@@ -1,6 +1,5 @@
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-linux < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
 
 ; x32 uses %esp, %ebp as stack and frame pointers
 
@@ -14,12 +13,6 @@
 ; X32ABI: movl %esp, %ebp
 ; X32ABI: movl %edi, -4(%ebp)
 ; X32ABI: popq %rbp
-; NACL-LABEL: foo
-; NACL: pushq %rbp
-; NACL: movq %rsp, %rbp
-; NACL: movl %edi, -4(%rbp)
-; NACL: popq %rbp
-
 define void @foo(ptr %a) #0 {
 entry:
@@ -30,5 +23,3 @@ entry:
 }
 
 attributes #0 = { nounwind uwtable "frame-pointer"="all"}
-
-
diff --git a/llvm/test/CodeGen/XCore/llvm.frexp.ll b/llvm/test/CodeGen/XCore/llvm.frexp.ll
new file mode 100644
index 0000000000000..5d63d234717d9
--- /dev/null
+++ b/llvm/test/CodeGen/XCore/llvm.frexp.ll
@@ -0,0 +1,85 @@
+; RUN: llc -mtriple=xcore-unknown-unknown < %s | FileCheck %s
+
+define { half, i32 } @test_frexp_f16_i32(half %a) nounwind {
+; CHECK-LABEL: test_frexp_f16_i32:
+; CHECK: bl __extendhfsf2
+; CHECK: bl frexpf
+; CHECK: ldw r{{[0-9]+}}, sp[1]
+; CHECK: bl __truncsfhf2
+%result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
+ ret { half, i32 } %result
+}
+
+define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) nounwind {
+; CHECK-LABEL: test_frexp_v2f16_v2i32:
+; CHECK: bl frexpf
+; CHECK: bl frexpf
+ %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
+ ret { <2 x half>, <2 x i32> } %result
+}
+
+define { float, i32 } @test_frexp_f32_i32(float %a) nounwind {
+; CHECK-LABEL: test_frexp_f32_i32:
+; CHECK: bl frexpf
+ %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
+ ret { float, i32 } %result
+}
+
+define { float, i32 } @test_frexp_f32_i32_tailcall(float %a) nounwind {
+; CHECK-LABEL: test_frexp_f32_i32_tailcall:
+; CHECK: bl frexpf
+ %result = tail call { float, i32 } @llvm.frexp.f32.i32(float %a)
+ ret { float, i32 } %result
+}
+
+define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) nounwind {
+; CHECK-LABEL: test_frexp_v2f32_v2i32:
+; CHECK: bl frexpf
+; CHECK: bl frexpf
+ %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
+ ret { <2 x float>, <2 x i32> } %result
+}
+
+define { double, i32 } @test_frexp_f64_i32(double %a) nounwind {
+; CHECK-LABEL: test_frexp_f64_i32:
+; CHECK: bl frexp
+ %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
+ ret { double, i32 } %result
+}
+
+define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) nounwind {
+; CHECK-LABEL: test_frexp_v2f64_v2i32:
+; CHECK: bl frexp
+; CHECK: bl frexp
+ %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
+ ret { <2 x double>, <2 x i32> } %result
+}
+
+define { fp128, i32 } @test_frexp_fp128_i32(fp128 %a) nounwind {
+; CHECK-LABEL: test_frexp_fp128_i32:
+; CHECK: bl frexpl
+ %result = call { fp128, i32 } @llvm.frexp.fp128.i32(fp128 %a)
+ ret { fp128, i32 } %result
+}
+
+define { <2 x fp128>, <2 x i32> } @test_frexp_v2fp128_v2i32(<2 x fp128> %a) nounwind {
+; CHECK-LABEL: test_frexp_v2fp128_v2i32:
+; CHECK: bl frexpl
+; CHECK: bl frexpl
+ %result = call { <2 x fp128>, <2 x i32> } @llvm.frexp.v2fp128.v2i32(<2 x fp128> %a)
+ ret { <2 x fp128>, <2 x i32> } %result
+}
+
+declare { half, i32 } @llvm.frexp.f16.i32(half) #0
+declare { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half>) #0
+
+declare { float, i32 } @llvm.frexp.f32.i32(float) #0
+declare { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float>) #0
+
+declare { double, i32 } @llvm.frexp.f64.i32(double) #0
+declare { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double>) #0
+
+declare { fp128, i32 } @llvm.frexp.fp128.i32(fp128) #0
+declare { <2 x fp128>, <2 x i32> } @llvm.frexp.v2fp128.v2i32(<2 x fp128>) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
index 0f8f505c51a58..5d73b2669ccda 100644
--- a/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
+++ b/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
@@ -7,6 +7,8 @@
 define dso_local void @f() !dbg !10 {
 entry:
+; Include non-key location to check verifier is checking the whole function.
+ %0 = add i32 0, 0, !dbg !14
 ret void, !dbg !13
 }
 
@@ -20,3 +22,4 @@
 !11 = !DISubroutineType(types: !12)
 !12 = !{null}
 !13 = !DILocation(line: 1, column: 11, scope: !10, atomGroup: 1, atomRank: 1)
+!14 = !DILocation(line: 1, column: 11, scope: !10)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
index 5bf529d7d32df..44545685b5121 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
@@ -948,19 +948,20 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[A1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[A1:%.*]] to <2 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[A1]] to <2 x i1>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <2 x i64> [[TMP1]] to <2 x double>
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0]], <2 x i64> [[A2:%.*]])
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[RES]] to <2 x i64>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i1> [[TMP2]] to i2
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i2 [[TMP6]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn()
 ; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A2:%.*]], <2 x i64> [[A1]])
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A3:%.*]], <2 x i64> [[A2]])
 ; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <2 x double> [[RES1]]
 ;
@@ -973,19 +974,20 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi
 define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[A1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[A1:%.*]] to <4 x i2>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[A1]] to <4 x i2>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
-; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0]], <4 x i64> [[A2:%.*]])
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i2> [[TMP2]] to i8
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP6]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn()
 ; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A2:%.*]], <4 x i64> [[A1]])
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A3:%.*]], <4 x i64> [[A2]])
 ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x double> [[RES1]]
 ;
@@ -1012,19 +1014,20 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 {
 define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[A1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i32> [[A1:%.*]] to <4 x i2>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i32> [[A1]] to <4 x i2>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP1]] to <4 x float>
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A1]])
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A2:%.*]])
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[RES]] to <4 x i32>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i2> [[TMP2]] to i8
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP6]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn()
 ; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A2:%.*]], <4 x i32> [[A1]])
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A3:%.*]], <4 x i32> [[A2]])
 ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x float> [[RES1]]
 ;
@@ -1047,7 +1050,7 @@ define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[A2]] to <4 x i2>
+; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[_MSLD]] to <4 x i2>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float>
 ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A2]])
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x float> [[RES]] to <4 x i32>
@@ -1072,19 +1075,20 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind
 define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[A1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i32> [[A1:%.*]] to <8 x i3>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i32> [[A1]] to <8 x i3>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
-; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0]], <8 x i32> [[A1]])
+; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0]], <8 x i32> [[A2:%.*]])
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i3> [[TMP2]] to i24
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP6]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn()
 ; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A2:%.*]], <8 x i32> [[A1]])
+; CHECK: 9:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A3:%.*]], <8 x i32> [[A2]])
 ; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x float> [[RES1]]
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
new file mode 100644
index 0000000000000..e5e4371c525b2
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
@@ -0,0 +1,670 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -S -mattr=+avx512vl,+gfni,+avx512bw -passes=msan 2>&1 | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
+;
+; Strictly handled:
+; - llvm.x86.vgf2p8affineinvqb.128
+; - llvm.x86.vgf2p8affineinvqb.256
+; - llvm.x86.vgf2p8affineinvqb.512
+; - llvm.x86.vgf2p8affineqb.128
+; - llvm.x86.vgf2p8affineqb.256
+; - llvm.x86.vgf2p8affineqb.512
+;
+; Heuristically handled:
+; - llvm.x86.vgf2p8mulb.128
+; - llvm.x86.vgf2p8mulb.256
+; - llvm.x86.vgf2p8mulb.512
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8)
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 {
+; CHECK-LABEL: @test_vgf2p8affineinvqb_128(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
+;
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP25]], <16 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP16]], <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP30]], <16 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP21]], <16 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> zeroinitializer, <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP32]], <16 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP33]], <16 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP34]], <16 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP36]] +; + %1 = bitcast i16 %mask to <16 
x i1> + %2 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3) + %3 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 4) + %4 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5) + %5 = select <16 x i1> %1, <16 x i8> %3, <16 x i8> zeroinitializer + %6 = select <16 x i1> %1, <16 x i8> %4, <16 x i8> %passthru + %7 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %2, 0 + %8 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, <16 x i8> %5, 1 + %9 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } %8, <16 x i8> %6, 2 + ret { <16 x i8>, <16 x i8>, <16 x i8> } %9 +} + +declare <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8>, <32 x i8>, i8) +define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineinvqb_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], 
i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <32 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <32 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP25]], <32 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP16]], <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <32 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <32 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP30]], <32 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP21]], <32 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> zeroinitializer, <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP32]], <32 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP33]], <32 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP34]], <32 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP36]] +; + %1 = bitcast i32 %mask to <32 x i1> + %2 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3) + %3 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 4) + %4 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 5) + %5 = select <32 x i1> %1, <32 x i8> %3, <32 x i8> zeroinitializer + %6 = select <32 x i1> %1, <32 x i8> %4, <32 x i8> %passthru + %7 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> %2, 0 + %8 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } %7, <32 x i8> %5, 1 + %9 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } %8, <32 x i8> %6, 2 + ret { <32 x i8>, <32 x i8>, <32 x i8> } %9 +} + +declare <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8>, <64 x i8>, i8) +define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineinvqb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <64 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <64 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <64 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP25]], <64 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP16]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <64 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <64 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <64 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP30]], <64 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP21]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> zeroinitializer, <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP32]], <64 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP33]], <64 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: 
[[TMP36:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP34]], <64 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP36]] +; + %1 = bitcast i64 %mask to <64 x i1> + %2 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3) + %3 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 4) + %4 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 5) + %5 = select <64 x i1> %1, <64 x i8> %3, <64 x i8> zeroinitializer + %6 = select <64 x i1> %1, <64 x i8> %4, <64 x i8> %passthru + %7 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> %2, 0 + %8 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } %7, <64 x i8> %5, 1 + %9 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } %8, <64 x i8> %6, 2 + ret { <64 x i8>, <64 x i8>, <64 x i8> } %9 +} + +declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8) +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineqb_128( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label 
[[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP25]], <16 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP16]], <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP30]], <16 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP21]], <16 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> zeroinitializer, <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP32]], <16 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP33]], <16 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP34]], <16 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP36]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3) + %3 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 4) + %4 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5) + %5 = select <16 x i1> %1, <16 x i8> %3, <16 x i8> zeroinitializer + %6 = select <16 x i1> %1, <16 x i8> %4, <16 x i8> %passthru + %7 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %2, 0 + %8 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, <16 x i8> %5, 1 + %9 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } %8, <16 x i8> %6, 2 + ret { <16 x i8>, <16 x i8>, <16 x i8> } %9 +} + +declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8) +define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineqb_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <32 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <32 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP25]], <32 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP16]], <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <32 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <32 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP30]], <32 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP21]], <32 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> zeroinitializer, <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[_MSPROP_SELECT]], 1 +; 
CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP32]], <32 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP33]], <32 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP34]], <32 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP36]] +; + %1 = bitcast i32 %mask to <32 x i1> + %2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3) + %3 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 4) + %4 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 5) + %5 = select <32 x i1> %1, <32 x i8> %3, <32 x i8> zeroinitializer + %6 = select <32 x i1> %1, <32 x i8> %4, <32 x i8> %passthru + %7 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> %2, 0 + %8 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } %7, <32 x i8> %5, 1 + %9 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } %8, <32 x i8> %6, 2 + ret { <32 x i8>, <32 x i8>, <32 x i8> } %9 +} + +declare <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8) +define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineqb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 
[[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <64 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <64 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <64 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP25]], <64 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP16]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <64 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <64 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <64 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP30]], <64 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP21]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> zeroinitializer, <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP32]], <64 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP33]], <64 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP34]], <64 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP36]] +; + %1 = bitcast i64 %mask to <64 x i1> + %2 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3) + %3 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 4) + %4 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 5) + %5 = select <64 x i1> %1, <64 x i8> %3, <64 x i8> zeroinitializer + %6 = select <64 x i1> %1, <64 x i8> %4, <64 x i8> %passthru + %7 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> %2, 0 + %8 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } %7, <64 x i8> %5, 1 + %9 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } %8, <64 x i8> %6, 2 + ret { <64 x i8>, <64 x i8>, <64 x i8> } %9 +} + +declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>) +define <16 x i8> @test_vgf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_128( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[TMP3]] +; + %1 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) + ret <16 x i8> %1 +} + +define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[_MSPROP]], <16 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i8> [[TMP7]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i8> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP11]], <16 x i8> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP7]], <16 x i8> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[TMP12]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) + %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> %passthru + ret <16 x i8> %3 +} + +define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i16 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_128_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[_MSPROP]], <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i8> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i8> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i8> [[TMP10]], <16 x i8> [[TMP7]] +; CHECK-NEXT: 
[[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> zeroinitializer +; CHECK-NEXT: store <16 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) + %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8>, <32 x i8>) +define <32 x i8> @test_vgf2p8mulb_256(<32 x i8> %src1, <32 x i8> %src2) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_256( +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <32 x i8> [[TMP3]] +; + %1 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) + ret <32 x i8> %1 +} + +define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[_MSPROP]], <32 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <32 x i8> [[TMP7]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <32 x i8> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP11]], <32 x i8> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP7]], <32 x i8> [[PASSTHRU]] +; CHECK-NEXT: store <32 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <32 x i8> [[TMP12]] +; + %1 = bitcast i32 %mask to <32 x i1> + %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) + %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> %passthru + ret <32 x i8> %3 +} + +define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 x i8> %src1, <32 x i8> %src2, i32 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_256_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[_MSPROP]], <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <32 x i8> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> [[TMP10]], <32 x i8> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP6]], <32 x i8> zeroinitializer +; CHECK-NEXT: store <32 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <32 x i8> [[TMP11]] +; + %1 = bitcast i32 %mask to <32 x i1> + %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) + %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8>, <64 x i8>) +define <64 x i8> @test_vgf2p8mulb_512(<64 x i8> %src1, <64 x i8> %src2) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: store <64 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <64 x i8> [[TMP3]] +; + %1 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) + ret <64 x i8> %1 +} + +define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[_MSPROP]], <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <64 x i8> [[TMP7]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i8> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <64 x i8> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP11]], <64 x i8> [[TMP8]] +; CHECK-NEXT: 
[[TMP12:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP7]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: store <64 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <64 x i8> [[TMP12]] +; + %1 = bitcast i64 %mask to <64 x i1> + %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) + %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> %passthru + ret <64 x i8> %3 +} + +define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i64 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_512_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[_MSPROP]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <64 x i8> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> [[TMP10]], <64 x i8> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP6]], <64 x i8> zeroinitializer +; CHECK-NEXT: store <64 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <64 x i8> [[TMP11]] +; + %1 = bitcast i64 %mask to <64 x i1> + %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) + %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> zeroinitializer + ret <64 x i8> %3 +} + +attributes #0 = { sanitize_memory } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll index b292a8a9b1d66..74cb49b0f602a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll @@ -8141,19 +8141,20 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3> ; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X2:%.*]]) ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x 
double> [[TMP7]] to <8 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i3> [[TMP2]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X2:%.*]], <8 x i64> [[X1]]) +; CHECK: 9: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X3:%.*]], <8 x i64> [[X2]]) ; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x double> [[TMP5]] ; @@ -8165,21 +8166,22 @@ define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3> ; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X4:%.*]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i3> [[TMP5]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP8]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X4:%.*]], <8 x i64> [[X1]]) +; CHECK: 11: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X5:%.*]], <8 x i64> [[X4]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP6]], <8 x i64> [[TMP4]] @@ -8201,20 +8203,21 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0 ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X1]] to <8 
x i3> ; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X2:%.*]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i3> [[TMP4]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP7]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X2:%.*]], <8 x i64> [[X1]]) +; CHECK: 10: +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X4:%.*]], <8 x i64> [[X2]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer @@ -8236,19 +8239,20 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4> ; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X2:%.*]]) ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i4> [[TMP2]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X2:%.*]], <16 x i32> [[X1]]) +; CHECK: 9: +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X3:%.*]], <16 x i32> [[X2]]) ; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP5]] ; @@ -8260,21 +8264,22 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4> ; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X4:%.*]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i4> [[TMP5]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP8]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X4:%.*]], <16 x i32> [[X1]]) +; CHECK: 11: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X5:%.*]], <16 x i32> [[X4]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP6]], <16 x i32> [[TMP4]] @@ -8297,20 +8302,21 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0 ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4> ; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X2:%.*]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i4> [[TMP4]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X2:%.*]], <16 x i32> [[X1]]) +; CHECK: 10: +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X4:%.*]], <16 x i32> [[X2]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer @@ -13713,28 +13719,29 @@ 
define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 5: +; CHECK: 6: ; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1]], <16 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X3:%.*]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i4> [[TMP14]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP11]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 13: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X4:%.*]]) +; CHECK: 14: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X3]], <16 x i32> [[X4:%.*]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP10]] ; @@ -13748,8 +13755,8 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof 
[[PROF1]] @@ -13762,8 +13769,8 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1]], <16 x i32> [[_MSLD]]) +; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i4> [[TMP18]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP19]], 0 ; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] @@ -13796,10 +13803,10 @@ define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP8]] to <8 x i3> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double> -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1]], <8 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i3> [[TMP6]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP12]], 0 @@ -13825,10 +13832,10 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i3> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1]], <8 x double> [[TMP6]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i3> [[TMP7]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP21]], 0 @@ -13864,10 +13871,10 @@ define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i4> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float> -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1]], <16 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i4> [[TMP6]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP12]], 0 @@ -13893,10 +13900,10 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i4> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1]], <16 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i4> [[TMP7]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP21]], 0 @@ -13930,17 +13937,18 @@ define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> % ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X3:%.*]], <8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i3> [[TMP8]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; 
CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X3]], <8 x i64> [[X2:%.*]]) ; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i64> [[TMP4]] ; @@ -13953,11 +13961,11 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i3> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i3> [[TMP13]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -13988,29 +13996,30 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 5: +; CHECK: 6: ; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> [[X0:%.*]] to <16 x i4> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X0]], <16 x i32> [[_MSLD]]) +; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> 
[[X0]] to <16 x i4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X4:%.*]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i4> [[TMP18]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP19]], 0 ; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2]]) +; CHECK: 14: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X4]], <16 x i32> [[X2]]) ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -14035,14 +14044,15 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK: 7: ; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 @@ -14052,19 +14062,19 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> [[EXTRA_PARAM:%.*]], double [[X2S]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> [[TMP5]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> [[EXTRA_PARAM]], <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i64> [[X0]] to <8 x i3> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to <8 x double> ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <8 x double> -; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP11]], <8 x i64> [[X0]], <8 x double> [[TMP24]]) +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP11]], <8 x i64> [[X4:%.*]], <8 x double> [[TMP24]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[TMP13]] to <8 x i64> ; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i3> [[TMP10]] to i24 ; 
CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i24 [[TMP25]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] -; CHECK: 16: +; CHECK: 17: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 17: -; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0]], <8 x double> [[X2]]) +; CHECK: 18: +; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X4]], <8 x double> [[X2]]) ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP14]], <8 x i64> zeroinitializer @@ -14091,21 +14101,22 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[X0:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float> -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X0]], <16 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X4:%.*]], <16 x float> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i4> [[TMP7]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK: 11: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0]], <16 x float> [[X2:%.*]]) +; CHECK: 12: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X4]], <16 x float> [[X2:%.*]]) ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer @@ -14130,18 +14141,19 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <8 x i64>, 
ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i3> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X0]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[X0]] to <8 x i3> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X4:%.*]], <8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i3> [[TMP13]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0]], <8 x i64> [[X2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X4]], <8 x i64> [[X2:%.*]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer @@ -14163,17 +14175,18 @@ define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[X0:%.*]] to <16 x i4> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0]], <16 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X3:%.*]], <16 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i4> [[TMP8]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X3]], <16 x i32> [[X2:%.*]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -14186,18 +14199,19 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[X0:%.*]] to <16 x i4> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0]], <16 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X4:%.*]], <16 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i4> [[TMP13]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X4]], <16 x i32> [[X2:%.*]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll index 8cfca3b07300f..3c1af6781f0ed 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll @@ -5495,28 +5495,29 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 5: ; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1]], <16 x i32> 
[[_MSLD]]) +; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X3:%.*]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i4> [[TMP13]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP10]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP12:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK: 13: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X3]], <16 x i32> [[X2]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP9]] ; @@ -5529,8 +5530,8 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -5543,8 +5544,8 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1]], <16 x i32> [[_MSLD]]) +; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i4> [[TMP18]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP19]], 0 ; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] @@ -5577,20 +5578,21 @@ define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: 
[[TMP6:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double> -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1]], <8 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X3:%.*]], <8 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i3> [[TMP6]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X3]], <8 x double> [[X2:%.*]]) ; CHECK-NEXT: store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x double> [[TMP9]] ; @@ -5605,10 +5607,10 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i3> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1]], <8 x double> [[TMP6]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i3> [[TMP7]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP21]], 0 @@ -5645,20 +5647,21 @@ define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float> -; CHECK-NEXT: [[TMP11:%.*]] = 
call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1]], <16 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X3:%.*]], <16 x float> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i4> [[TMP6]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X3]], <16 x float> [[X2:%.*]]) ; CHECK-NEXT: store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP9]] ; @@ -5673,10 +5676,10 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i4> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1]], <16 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i4> [[TMP7]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP21]], 0 @@ -5713,17 +5716,18 @@ define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> % ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X3:%.*]], <8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i3> [[TMP8]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label 
[[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X3]], <8 x i64> [[X2:%.*]]) ; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i64> [[TMP4]] ; @@ -5735,11 +5739,11 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i3> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i3> [[TMP13]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP14]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -5769,29 +5773,30 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 5: +; CHECK: 6: ; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> [[X0:%.*]] 
to <16 x i4> -; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X0]], <16 x i32> [[_MSLD]]) +; CHECK-NEXT: [[TMP18:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X4:%.*]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i4> [[TMP18]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP19]], 0 ; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2]]) +; CHECK: 14: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X4]], <16 x i32> [[X2]]) ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -5816,14 +5821,15 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK: 8: ; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 @@ -5833,19 +5839,19 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> [[EXTRA_PARAM:%.*]], double [[X2S]], i32 0 ; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> [[TMP6]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> [[EXTRA_PARAM2:%.*]], <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP11:%.*]] = trunc <8 x i64> [[X0]] to <8 x i3> ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i64> [[TMP2]] to <8 x double> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <8 x double> -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP24]], <8 x i64> [[X0]], <8 x double> [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP24]], <8 x i64> [[X4:%.*]], <8 x double> [[TMP13]]) ; CHECK-NEXT: 
[[TMP25:%.*]] = bitcast <8 x double> [[TMP14]] to <8 x i64> ; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i3> [[TMP11]] to i24 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i24 [[TMP26]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP27:%.*]], label [[TMP28:%.*]], !prof [[PROF1]] -; CHECK: 17: +; CHECK: 18: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0]], <8 x double> [[X2]]) +; CHECK: 19: +; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X4]], <8 x double> [[X2]]) ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer @@ -5871,21 +5877,22 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[X0:%.*]] to <16 x i4> +; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float> -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X0]], <16 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X4:%.*]], <16 x float> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i4> [[TMP7]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK: 11: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0]], <16 x float> [[X2:%.*]]) +; CHECK: 12: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X4]], <16 x float> [[X2:%.*]]) ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer @@ -5908,18 +5915,19 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i3>
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X0]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <8 x i64> [[X0]] to <8 x i3>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X4:%.*]], <8 x i64> [[TMP3]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i3> [[TMP13]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP14]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0]], <8 x i64> [[X2:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X4]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
@@ -5941,17 +5949,18 @@ define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32
 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i32> [[X0:%.*]] to <16 x i4>
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X3:%.*]], <16 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i4> [[TMP8]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X3]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -5963,18 +5972,19 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[X0:%.*]] to <16 x i4>
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X4:%.*]], <16 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i4> [[TMP13]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP14]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X4]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
@@ -8478,19 +8488,20 @@ declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3>
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
 ; CHECK-NEXT:    [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1]])
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i3> [[TMP2]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X2:%.*]], <8 x i64> [[X1]])
+; CHECK:       9:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X3:%.*]], <8 x i64> [[X2]])
 ; CHECK-NEXT:    store <8 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[RES1]]
 ;
@@ -8501,21 +8512,22 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x
 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3>
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
 ; CHECK-NEXT:    [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1]])
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X3:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i3> [[TMP5]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP8]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
-; CHECK:       9:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       10:
-; CHECK-NEXT:    [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X3:%.*]], <8 x i64> [[X1]])
+; CHECK:       11:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X4:%.*]], <8 x i64> [[X3]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP6]], <8 x i64> [[TMP4]]
@@ -8538,20 +8550,21 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0,
 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_maskz(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <8 x i64> [[X1:%.*]] to <8 x i3>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
 ; CHECK-NEXT:    [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X1]])
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i3> [[TMP4]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP7]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X2:%.*]], <8 x i64> [[X1]])
+; CHECK:       10:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X3:%.*]], <8 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer
@@ -8575,19 +8588,20 @@ declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>
 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4>
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
 ; CHECK-NEXT:    [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1]])
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i4> [[TMP2]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X2:%.*]], <16 x i32> [[X1]])
+; CHECK:       9:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X3:%.*]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[RES1]]
 ;
@@ -8598,21 +8612,22 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16
 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4>
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
 ; CHECK-NEXT:    [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1]])
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X3:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i4> [[TMP5]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP8]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
-; CHECK:       9:
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       10:
-; CHECK-NEXT:    [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X3:%.*]], <16 x i32> [[X1]])
+; CHECK:       11:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X4:%.*]], <16 x i32> [[X3]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP6]], <16 x i32> [[TMP4]]
@@ -8635,20 +8650,21 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0,
 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_maskz(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <16 x i32> [[X1:%.*]] to <16 x i4>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
 ; CHECK-NEXT:    [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
-; CHECK-NEXT:    [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X1]])
+; CHECK-NEXT:    [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i4> [[TMP4]] to i64
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X2:%.*]], <16 x i32> [[X1]])
+; CHECK:       10:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X3:%.*]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
index 18441b2d7e253..02df9c49a010b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll
@@ -5108,17 +5108,18 @@ define <32 x i16> @test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i
 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X0]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5>
+; CHECK-NEXT:    [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X3:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i5> [[TMP3]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP103:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X0]], <32 x i16> [[X2:%.*]])
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP103:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X3]], <32 x i16> [[X2:%.*]])
 ; CHECK-NEXT:    store <32 x i16> [[TMP100]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[TMP103]]
 ;
@@ -5130,18 +5131,19 @@ define <32 x i16> @test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <3
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X0]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5>
+; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X4:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i5> [[TMP5]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP104:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X0]], <32 x i16> [[X2:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP104:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X4]], <32 x i16> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP105:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP106:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP107:%.*]] = select <32 x i1> [[TMP106]], <32 x i16> [[TMP101]], <32 x i16> [[TMP1]]
@@ -5163,18 +5165,19 @@ define <32 x i16> @test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X0]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5>
+; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X4:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i5> [[TMP5]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP104:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X0]], <32 x i16> [[X2:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP104:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X4]], <32 x i16> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP105:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP106:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP107:%.*]] = select <32 x i1> [[TMP106]], <32 x i16> [[TMP101]], <32 x i16> zeroinitializer
@@ -5196,17 +5199,18 @@ define <32 x i16> @test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <32 x i16> [[X1:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X1]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <32 x i16> [[X1]] to <32 x i5>
+; CHECK-NEXT:    [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X3:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i5> [[TMP3]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP103:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1]], <32 x i16> [[X2:%.*]])
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP103:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X3]], <32 x i16> [[X2:%.*]])
 ; CHECK-NEXT:    store <32 x i16> [[TMP100]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[TMP103]]
 ;
@@ -5218,11 +5222,11 @@ define <32 x i16> @test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <3
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X1:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X1]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[TMP3]] to <32 x i5>
+; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X1:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x i5> [[TMP5]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP7]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
index e4c1e71721030..78c272c7b2c5a 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
@@ -1477,17 +1477,18 @@ define <32 x i16>@test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i1
 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X0]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5>
+; CHECK-NEXT:    [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X3:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i5> [[TMP3]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP103:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X0]], <32 x i16> [[X2:%.*]])
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP103:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X3]], <32 x i16> [[X2:%.*]])
 ; CHECK-NEXT:    store <32 x i16> [[TMP100]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[TMP103]]
 ;
@@ -1499,18 +1500,19 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X0]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5>
+; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X4:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i5> [[TMP5]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP104:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X0]], <32 x i16> [[X2:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP104:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X4]], <32 x i16> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP105:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP106:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP107:%.*]] = select <32 x i1> [[TMP106]], <32 x i16> [[TMP101]], <32 x i16> [[TMP1]]
@@ -1532,18 +1534,19 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X0]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5>
+; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X4:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i5> [[TMP5]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP104:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X0]], <32 x i16> [[X2:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP104:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X1:%.*]], <32 x i16> [[X4]], <32 x i16> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP105:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP106:%.*]] = bitcast i32 [[X3:%.*]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP107:%.*]] = select <32 x i1> [[TMP106]], <32 x i16> [[TMP101]], <32 x i16> zeroinitializer
@@ -1567,17 +1570,18 @@ define <32 x i16>@test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i1
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[X1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <32 x i16> [[X1:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X1]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <32 x i16> [[X1]] to <32 x i5>
+; CHECK-NEXT:    [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X3:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i5> [[TMP3]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP103:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1]], <32 x i16> [[X2:%.*]])
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP103:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X3]], <32 x i16> [[X2:%.*]])
 ; CHECK-NEXT:    store <32 x i16> [[TMP100]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[TMP103]]
 ;
@@ -1589,11 +1593,11 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[X1:%.*]] to <32 x i5>
-; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X1]], <32 x i16> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <32 x i16> [[TMP3]] to <32 x i5>
+; CHECK-NEXT:    [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X1:%.*]], <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x i5> [[TMP5]] to i160
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i160 [[TMP7]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
index 3bbbabcb29778..db0c2e7ae9ed6 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -1902,16 +1902,17 @@ define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i32> [[X1]] to <4 x i2>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X1]], <4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i2> [[TMP3]] to i8
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i8 [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
@@ -1926,10 +1927,10 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[X1]] to <4 x i2>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X1]], <4 x i32> [[TMP6]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i2> [[TMP9]] to i8
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i8 [[TMP13]], 0
@@ -1964,16 +1965,17 @@ define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i32> [[X0]] to <4 x i2>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X0]], <4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i2> [[TMP3]] to i8
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i8 [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
@@ -1988,17 +1990,18 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[X0]] to <4 x i2>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP6]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i2> [[TMP9]] to i8
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i8 [[TMP13]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2026,17 +2029,18 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i32> [[X0]] to <4 x i2>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP9]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i2> [[TMP14]] to i8
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i8 [[TMP15]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2065,16 +2069,17 @@ define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[X1]] to <8 x i3>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X1]], <8 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i3> [[TMP3]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -2089,10 +2094,10 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <8 x i32> [[X1]] to <8 x i3>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X1]], <8 x i32> [[TMP6]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i3> [[TMP9]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP13]], 0
@@ -2124,16 +2129,17 @@ define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i3
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[X0]] to <8 x i3>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X0]], <8 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i3> [[TMP3]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -2148,17 +2154,18 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <8 x i32> [[X0]] to <8 x i3>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP6]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i3> [[TMP9]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP13]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2183,17 +2190,18 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc <8 x i32> [[X0]] to <8 x i3>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i3>
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP9]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i3> [[TMP14]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP15]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2219,19 +2227,20 @@ define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[X1]] to <2 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP6]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to <2 x double>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <2 x double>
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[TMP8]], <2 x i64> [[X1]], <2 x double> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP10]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i1> [[TMP3]] to i2
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i2 [[TMP11]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
-; CHECK:       [[BB9]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB10]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
 ; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -2249,7 +2258,7 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0,
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP15:%.*]] = trunc <2 x i64> [[X1]] to <2 x i1>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <2 x i64> [[TMP13]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to <2 x double>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP8]] to <2 x double>
 ; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[TMP9]], <2 x i64> [[X1]], <2 x double> [[TMP12]])
@@ -2293,19 +2302,20 @@ define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i64> [[X1]] to <4 x i2>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to <4 x double>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP4]] to <4 x double>
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[TMP8]], <4 x i64> [[X1]], <4 x double> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP10]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i2> [[TMP3]] to i8
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i8 [[TMP11]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
-; CHECK:       [[BB9]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB10]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
 ; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
@@ -2323,7 +2333,7 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0,
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i64> [[X1]] to <4 x i2>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to <4 x double>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP8]] to <4 x double>
 ; CHECK-NEXT:    [[TMP17:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[TMP9]], <4 x i64> [[X1]], <4 x double> [[TMP12]])
@@ -2367,19 +2377,20 @@ define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i32> [[X1]] to <4 x i2>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to <4 x float>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP8]], <4 x i32> [[X1]], <4 x float> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP10]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i2> [[TMP3]] to i8
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i8 [[TMP11]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
-; CHECK:       [[BB9]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB10]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -2397,7 +2408,7 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i32> [[X1]] to <4 x i2>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to <4 x float>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
 ; CHECK-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP9]], <4 x i32> [[X1]], <4 x float> [[TMP12]])
@@ -2445,7 +2456,7 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP11]] to <4 x i32>
 ; CHECK-NEXT:    [[X1CAST:%.*]] = bitcast <2 x i64> [[X1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[X1CAST]] to <4 x i2>
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> [[TMP13]] to <4 x float>
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP16]], <4 x i32> [[X1CAST]], <4 x float> [[TMP18]])
@@ -2490,19 +2501,20 @@ define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[X1]] to <8 x i3>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP6]] to <8 x i3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to <8 x float>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[TMP8]], <8 x i32> [[X1]], <8 x float> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP10]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i3> [[TMP3]] to i24
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i24 [[TMP11]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
-; CHECK:       [[BB9]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB10]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
@@ -2520,7 +2532,7 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP15:%.*]] = trunc <8 x i32> [[X1]] to <8 x i3>
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to <8 x float>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; CHECK-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[TMP9]], <8 x i32> [[X1]], <8 x float> [[TMP12]])
@@ -2561,16 +2573,17 @@ define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[X1]] to <2 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP8]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X1]], <2 x i64> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i1> [[TMP3]] to i2
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i2 [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
@@ -2585,10 +2598,10 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <2 x i64> [[X1]] to <2 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X1]], <2 x i64> [[TMP6]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <2 x i1> [[TMP9]] to i2
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i2 [[TMP13]], 0
@@ -2623,16 +2636,17 @@ define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[X0]] to <2 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP8]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X0]], <2 x i64> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i1> [[TMP3]] to i2
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i2 [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
@@ -2647,17 +2661,18 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <2 x i64> [[X0]] to <2 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP6]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <2 x i1> [[TMP9]] to i2
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i2 [[TMP13]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2685,17 +2700,18 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc <2 x i64> [[X0]] to <2 x i1>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP9]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i1> [[TMP14]] to i2
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i2 [[TMP15]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2724,16 +2740,17 @@ define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i64> [[X1]] to <4 x i2>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i64> [[TMP8]] to <4 x i2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X1]], <4 x i64> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i2> [[TMP3]] to i8
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i8 [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -2748,10 +2765,10 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i64> [[X1]] to <4 x i2>
+; CHECK-NEXT:    [[TMP9:%.*]] =
trunc <4 x i64> [[TMP3]] to <4 x i2> ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X1]], <4 x i64> [[TMP6]]) ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i2> [[TMP9]] to i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP13]], 0 @@ -2786,16 +2803,17 @@ define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> % ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[X0]] to <4 x i2> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP8]] to <4 x i2> ; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X0]], <4 x i64> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i2> [[TMP3]] to i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB8]]: ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]]) ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -2810,17 +2828,18 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i64> [[X0]] to <4 x i2> +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i2> ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP6]]) ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i2> [[TMP9]] to i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] -; CHECK: [[BB7]]: +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB8]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to 
<8 x i1> @@ -2848,17 +2867,18 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[X0]] to <4 x i2> +; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i2> ; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i2> [[TMP14]] to i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP15]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] -; CHECK: [[BB7]]: +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB8]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]]) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll index 400499e5a9d93..429829ef39ab9 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll @@ -53,7 +53,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[T]] to <2 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i1> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[T]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i1> [[TMP10]] to i2 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i2 [[TMP12]], 0 @@ -85,7 +85,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2 ; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[T]] to <2 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i1> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[T]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i1> [[TMP10]] to i2 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i2 [[TMP12]], 0 @@ -146,7 +146,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i64> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <4 x 
i64> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <4 x i64> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[T]] to <4 x i2> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i2> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[T]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i2> [[TMP10]] to i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP12]], 0 @@ -207,7 +207,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <8 x i64> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i64> [[T]] to <8 x i3> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i64> [[TMP9]] to <8 x i3> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP6]], <8 x i64> [[T]], <8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i3> [[TMP10]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP12]], 0 @@ -272,7 +272,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <4 x i32> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i32> [[T]] to <4 x i2> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i2> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[T]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i2> [[TMP10]] to i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP12]], 0 @@ -333,7 +333,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <8 x i32> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i32> [[T]] to <8 x i3> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i32> [[TMP9]] to <8 x i3> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[T]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i3> [[TMP10]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP12]], 0 @@ -394,7 +394,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32 ; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <16 x i32> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[T]] to <16 x i4> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[TMP9]] to <16 x i4> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP6]], <16 x i32> [[T]], <16 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i4> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP12]], 0 @@ -459,7 +459,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i16> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <8 x i16> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i16> [[T]] to <8 x i3> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <8 x i16> [[TMP9]] to <8 x i3> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call 
<8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP6]], <8 x i16> [[T]], <8 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i3> [[TMP10]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP12]], 0 @@ -520,7 +520,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16 ; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i16> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <16 x i16> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i16> [[T]] to <16 x i4> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i16> [[TMP9]] to <16 x i4> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP6]], <16 x i16> [[T]], <16 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i4> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP12]], 0 @@ -581,7 +581,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16 ; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <32 x i16> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <32 x i16> [[T]] to <32 x i5> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <32 x i16> [[TMP9]] to <32 x i5> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP6]], <32 x i16> [[T]], <32 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i5> [[TMP10]] to i160 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i160 [[TMP12]], 0 @@ -646,7 +646,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i8> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i8> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <16 x i8> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i8> [[T]] to <16 x i4> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i8> [[TMP9]] to <16 x i4> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP6]], <16 x i8> [[T]], <16 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i4> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP12]], 0 @@ -707,7 +707,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <32 x i8> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <32 x i8> [[T]] to <32 x i5> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <32 x i8> [[TMP9]] to <32 x i5> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP6]], <32 x i8> [[T]], <32 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i5> [[TMP10]] to i160 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i160 [[TMP12]], 0 @@ -768,7 +768,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x ; CHECK-NEXT: [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[T:%.*]] = or <64 x i8> [[M]], -; CHECK-NEXT: [[TMP10:%.*]] = trunc <64 x i8> [[T]] to <64 x i6> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <64 x i8> [[TMP9]] to <64 x i6> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP6]], <64 x i8> [[T]], <64 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <64 x i6> [[TMP10]] to i384 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne 
i384 [[TMP12]], 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll index 34cf24c7208b7..cbc556f8a8ee2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll @@ -987,20 +987,21 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[A1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[A1:%.*]] to <2 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[A1]] to <2 x i1> ; CHECK-NEXT: [[A0:%.*]] = bitcast <2 x i64> [[TMP1]] to <2 x double> -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A0]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x double> [[RES]] to <2 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i1> [[TMP3]] to i2 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i2 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A2:%.*]], <2 x i64> [[A1]]) +; CHECK: 10: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[A3:%.*]], <2 x i64> [[A2]]) ; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES1]] ; @@ -1013,20 +1014,21 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[A1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[A1:%.*]] to <4 x i2> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[A1]] to <4 x i2> ; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double> -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0]], <4 x i64> [[A1]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A0]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i2> [[TMP3]] to i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: br 
i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A2:%.*]], <4 x i64> [[A1]]) +; CHECK: 10: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[A3:%.*]], <4 x i64> [[A2]]) ; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES1]] ; @@ -1054,20 +1056,21 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 { define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[A1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[A1:%.*]] to <4 x i2> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[A1]] to <4 x i2> ; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP1]] to <4 x float> -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[RES]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i2> [[TMP3]] to i8 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A2:%.*]], <4 x i32> [[A1]]) +; CHECK: 10: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A3:%.*]], <4 x i32> [[A2]]) ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES1]] ; @@ -1091,7 +1094,7 @@ define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0 ; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[A2]] to <4 x i2> +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[_MSLD]] to <4 x i2> ; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[A0]], <4 x i32> [[A2]]) ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x float> [[RES]] to <4 x i32> @@ -1116,20 +1119,21 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[A1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load 
i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i32> [[A1:%.*]] to <8 x i3> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i32> [[A1]] to <8 x i3> ; CHECK-NEXT: [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float> -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0]], <8 x i32> [[A1]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A0]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i3> [[TMP3]] to i24 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i24 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A2:%.*]], <8 x i32> [[A1]]) +; CHECK: 10: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[A3:%.*]], <8 x i32> [[A2]]) ; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES1]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/or.ll b/llvm/test/Instrumentation/MemorySanitizer/or.ll index 27a1800aa495b..20993a54187ac 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/or.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/or.ll @@ -29,28 +29,41 @@ define i8 @test_or(i8 %a, i8 %b) sanitize_memory { } define i8 @test_disjoint_or(i8 %a, i8 %b) sanitize_memory { -; CHECK-LABEL: define i8 @test_disjoint_or( -; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = xor i8 [[A]], -1 -; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[B]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = and i8 [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], [[TMP7]] +; CHECK-IMPRECISE-LABEL: define i8 @test_disjoint_or( +; CHECK-IMPRECISE-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-IMPRECISE-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-IMPRECISE-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-IMPRECISE-NEXT: call void @llvm.donothing() +; CHECK-IMPRECISE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], -1 +; CHECK-IMPRECISE-NEXT: [[TMP4:%.*]] = xor i8 [[B]], -1 +; CHECK-IMPRECISE-NEXT: [[TMP5:%.*]] = and i8 [[TMP1]], [[TMP2]] +; CHECK-IMPRECISE-NEXT: [[TMP6:%.*]] = and i8 [[TMP3]], [[TMP2]] +; CHECK-IMPRECISE-NEXT: [[TMP7:%.*]] = and i8 [[TMP1]], [[TMP4]] +; CHECK-IMPRECISE-NEXT: [[TMP8:%.*]] = or i8 [[TMP5]], [[TMP6]] +; CHECK-IMPRECISE-NEXT: [[TMP9:%.*]] = or i8 [[TMP8]], [[TMP7]] +; CHECK-IMPRECISE-NEXT: [[C:%.*]] = or disjoint i8 [[A]], [[B]] +; CHECK-IMPRECISE-NEXT: store i8 [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-IMPRECISE-NEXT: ret i8 [[C]] ; -; CHECK-IMPRECISE: [[C:%.*]] = or 
disjoint i8 [[A]], [[B]] -; CHECK-IMPRECISE-NEXT: store i8 [[TMP11]], ptr @__msan_retval_tls, align 8 -; -; CHECK-PRECISE: [[TMP10:%.*]] = and i8 [[A]], [[B]] -; CHECK-PRECISE-NEXT: [[TMP12:%.*]] = or i8 [[TMP11]], [[TMP10]] +; CHECK-PRECISE-LABEL: define i8 @test_disjoint_or( +; CHECK-PRECISE-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-PRECISE-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-PRECISE-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-PRECISE-NEXT: call void @llvm.donothing() +; CHECK-PRECISE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], -1 +; CHECK-PRECISE-NEXT: [[TMP4:%.*]] = xor i8 [[B]], -1 +; CHECK-PRECISE-NEXT: [[TMP5:%.*]] = and i8 [[TMP1]], [[TMP2]] +; CHECK-PRECISE-NEXT: [[TMP6:%.*]] = and i8 [[TMP3]], [[TMP2]] +; CHECK-PRECISE-NEXT: [[TMP7:%.*]] = and i8 [[TMP1]], [[TMP4]] +; CHECK-PRECISE-NEXT: [[TMP8:%.*]] = or i8 [[TMP5]], [[TMP6]] +; CHECK-PRECISE-NEXT: [[TMP9:%.*]] = or i8 [[TMP8]], [[TMP7]] +; CHECK-PRECISE-NEXT: [[TMP10:%.*]] = and i8 [[A]], [[B]] +; CHECK-PRECISE-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-PRECISE-NEXT: [[TMP12:%.*]] = sext i1 [[TMP11]] to i8 +; CHECK-PRECISE-NEXT: [[_MS_DISJOINT:%.*]] = or i8 [[TMP9]], [[TMP12]] ; CHECK-PRECISE-NEXT: [[C:%.*]] = or disjoint i8 [[A]], [[B]] -; CHECK-PRECISE-NEXT: store i8 [[TMP12]], ptr @__msan_retval_tls, align 8 -; -; CHECK-NEXT: ret i8 [[C]] +; CHECK-PRECISE-NEXT: store i8 [[_MS_DISJOINT]], ptr @__msan_retval_tls, align 8 +; CHECK-PRECISE-NEXT: ret i8 [[C]] ; %c = or disjoint i8 %a, %b ret i8 %c diff --git a/llvm/test/MC/AArch64/directives-case_insensitive.s b/llvm/test/MC/AArch64/directives-case_insensitive.s index 35a90a1bffea8..c2bdec73e349e 100644 --- a/llvm/test/MC/AArch64/directives-case_insensitive.s +++ b/llvm/test/MC/AArch64/directives-case_insensitive.s @@ -15,8 +15,8 @@ tlbi vmalle1os .INST 0x5e104020 // CHECK: .inst 0x5e104020 -.RELOC 0, R_AARCH64_NONE, 8 -// CHECK: .reloc 0, R_AARCH64_NONE, 8 +.RELOC ., R_AARCH64_NONE, 8 +// CHECK: .reloc {{.*}}, R_AARCH64_NONE, 8 .HWORD 0x1234 // CHECK: .hword 4660 diff --git a/llvm/test/MC/AArch64/reloc-directive-err.s b/llvm/test/MC/AArch64/reloc-directive-err.s index 6eec2ae10c0a6..883dd2a06f28e 100644 --- a/llvm/test/MC/AArch64/reloc-directive-err.s +++ b/llvm/test/MC/AArch64/reloc-directive-err.s @@ -1,6 +1,6 @@ # RUN: llvm-mc -triple=aarch64 %s 2>&1 | FileCheck --check-prefix=PRINT %s # RUN: not llvm-mc -filetype=obj -triple=aarch64 %s -o /dev/null 2>&1 | FileCheck %s -# PRINT: .reloc 0, R_INVALID, 0 +# PRINT: .reloc {{.*}}, R_INVALID, 0 # CHECK: {{.*}}.s:[[# @LINE+1]]:11: error: unknown relocation name -.reloc 0, R_INVALID, 0 +.reloc ., R_INVALID, 0 diff --git a/llvm/test/MC/AArch64/reloc-directive.s b/llvm/test/MC/AArch64/reloc-directive.s index 09b0f0d3cb9d3..a502201fb9291 100644 --- a/llvm/test/MC/AArch64/reloc-directive.s +++ b/llvm/test/MC/AArch64/reloc-directive.s @@ -2,32 +2,32 @@ # RUN: llvm-mc -filetype=obj -triple=aarch64-linux-musl %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_AARCH64_NONE, .data -# PRINT: .reloc 4, R_AARCH64_NONE, foo+4 -# PRINT: .reloc 0, R_AARCH64_NONE, 8 -# PRINT: .reloc 0, R_AARCH64_ABS64, .data+2 -# PRINT: .reloc 0, R_AARCH64_TLSDESC, foo+3 -# PRINT: .reloc 0, R_AARCH64_IRELATIVE, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT: .reloc 0, BFD_RELOC_16, 9 -# PRINT: .reloc 0, BFD_RELOC_32, 9 -# PRINT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+8, R_AARCH64_NONE, .data +# PRINT: .reloc 
{{.*}}+4, R_AARCH64_NONE, foo+4 +# PRINT: .reloc {{.*}}+0, R_AARCH64_NONE, 8 +# PRINT: .reloc {{.*}}+0, R_AARCH64_ABS64, .data+2 +# PRINT: .reloc {{.*}}+0, R_AARCH64_TLSDESC, foo+3 +# PRINT: .reloc {{.*}}+0, R_AARCH64_IRELATIVE, 5 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_NONE, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_16, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_64, 9 .text + .reloc .+8, R_AARCH64_NONE, .data + .reloc .+4, R_AARCH64_NONE, foo+4 + .reloc .+0, R_AARCH64_NONE, 8 + + .reloc .+0, R_AARCH64_ABS64, .data+2 + .reloc .+0, R_AARCH64_TLSDESC, foo+3 + .reloc .+0, R_AARCH64_IRELATIVE, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 ret nop nop - .reloc 8, R_AARCH64_NONE, .data - .reloc 4, R_AARCH64_NONE, foo+4 - .reloc 0, R_AARCH64_NONE, 8 - - .reloc 0, R_AARCH64_ABS64, .data+2 - .reloc 0, R_AARCH64_TLSDESC, foo+3 - .reloc 0, R_AARCH64_IRELATIVE, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/AMDGPU/fixup64.s b/llvm/test/MC/AMDGPU/fixup64.s new file mode 100644 index 0000000000000..f86d8b2d69f49 --- /dev/null +++ b/llvm/test/MC/AMDGPU/fixup64.s @@ -0,0 +1,27 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti -show-encoding %s | FileCheck --check-prefix=SI %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti -show-encoding %s 2>&1 | FileCheck --check-prefix=SI-ERR --implicit-check-not=error: --strict-whitespace %s + +.LL1: +.LL2: +s_mov_b64 vcc, .LL2-.LL1 +// GFX1250: s_mov_b64 vcc, .LL2-.LL1 ; encoding: [0xfe,0x01,0xea,0xbe,A,A,A,A,A,A,A,A] +// GFX1250-NEXT: ; fixup A - offset: 4, value: .LL2-.LL1, kind: FK_Data_8 +// SI-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: invalid operand for instruction + +s_mov_b32 s0, .LL2-.LL1 +// SI: s_mov_b32 s0, .LL2-.LL1 ; encoding: [0xff,0x03,0x80,0xbe,A,A,A,A] +// SI-NEXT: ; fixup A - offset: 4, value: .LL2-.LL1, kind: FK_Data_4 + +// GFX1250: s_mov_b32 s0, .LL2-.LL1 ; encoding: [0xff,0x00,0x80,0xbe,A,A,A,A] +// GFX1250-NEXT: ; fixup A - offset: 4, value: .LL2-.LL1, kind: FK_Data_4 + +s_mov_b64 s[0:1], sym@abs64 +// GFX1250: s_mov_b64 s[0:1], sym@abs64 ; encoding: [0xfe,0x01,0x80,0xbe,A,A,A,A,A,A,A,A] +// GFX1250-NEXT: ; fixup A - offset: 4, value: sym@abs64, kind: FK_Data_8 +// SI-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: invalid operand for instruction + +s_mov_b64 s[0:1], callee@rel64 +// GFX1250: s_mov_b64 s[0:1], callee@rel64 ; encoding: [0xfe,0x01,0x80,0xbe,A,A,A,A,A,A,A,A] +// GFX1250-NEXT: ; fixup A - offset: 4, value: callee@rel64, kind: FK_PCRel_8 +// SI-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s index 3d6af6ba6dbf8..6bb0f4b1dff2d 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -3628,18 +3628,6 @@ v_alignbit_b32 v5, v1, v2, exec_lo v_alignbit_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4e,0xd5,0x01,0x05,0xfe,0x01] -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1] -// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1] -// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 
op_sel:[1,1,1] -// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] - v_alignbyte_b32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] @@ -3727,18 +3715,6 @@ v_alignbyte_b32 v5, v1, v2, exec_lo v_alignbyte_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0xfe,0x01] -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1] -// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1] -// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1] -// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] - v_mullit_f32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x50,0xd5,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s index 56c8d7ec07496..41b6e93357a3f 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_sop1.s @@ -1,6 +1,22 @@ // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +s_add_pc_i64 s[2:3] +// GFX1250: s_add_pc_i64 s[2:3] ; encoding: [0x02,0x4b,0x80,0xbe] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +s_add_pc_i64 4 +// GFX1250: s_add_pc_i64 4 ; encoding: [0x84,0x4b,0x80,0xbe] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +s_add_pc_i64 100 +// GFX1250: s_add_pc_i64 0x64 ; encoding: [0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +s_add_pc_i64 0x12345678abcd0 +// GFX1250: s_add_pc_i64 lit64(0x12345678abcd0) ; encoding: [0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + s_get_pc_i64 s[2:3] // GFX1250: s_get_pc_i64 s[2:3] ; encoding: [0x00,0x47,0x82,0xbe] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s index 07b4055f0ab9c..737d7b3de4e92 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s @@ -20,3 +20,135 @@ tensor_stop tensor_stop th:TH_STORE_BYPASS scope:SCOPE_SYS // GFX1250: tensor_stop th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x1b,0xee,0x00,0x00,0x3c,0x00,0x00,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_atomic_add_f32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_add_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_add_u32 v2, v3, s[2:3] 
offset:-64 +// GFX1250: flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_and_b32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_and_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] +// GFX1250: flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_max_i32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_max_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + 
+flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_max_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_max_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_i32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_min_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_min_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_or_b32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_or_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are 
not valid for this GPU or mode + +flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index cf8ad0e8b7b65..279bb262bff04 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -28,6 +28,96 @@ v_mov_b64 v[4:5], 0.5 v_mov_b64 v[254:255], 0xaf123456 // GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +v_tanh_f32 v5, v1 +// GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e] + +v_tanh_f32 v5, v255 +// GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e] + +v_tanh_f32 v5, s1 +// GFX1250: v_tanh_f32_e32 v5, s1 ; encoding: [0x01,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, s105 +// GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, vcc_lo +// GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, vcc_hi +// GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, ttmp15 +// GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, m0 +// GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, exec_lo +// GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, exec_hi +// GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, null +// GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, -1 +// GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, 0.5 +// GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, src_scc +// GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e] + +v_tanh_f32 v255, 0xaf123456 +// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; 
encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_tanh_f16 v5, v1
+// GFX1250: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e]
+
+v_tanh_f16 v5, v127
+// GFX1250: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e]
+
+v_tanh_f16 v5, s1
+// GFX1250: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, s105
+// GFX1250: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, vcc_lo
+// GFX1250: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, vcc_hi
+// GFX1250: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, ttmp15
+// GFX1250: v_tanh_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, m0
+// GFX1250: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, exec_lo
+// GFX1250: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, exec_hi
+// GFX1250: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, null
+// GFX1250: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, -1
+// GFX1250: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, 0.5
+// GFX1250: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, src_scc
+// GFX1250: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v127, 0x8000
+// GFX1250: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
 v_tanh_bf16 v5, v1
 // GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]
@@ -73,6 +163,321 @@ v_tanh_bf16 v5, src_scc
 v_tanh_bf16 v127, 0x8000
 // GFX1250: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_rcp_bf16 v5, v1
+// GFX1250: v_rcp_bf16_e32 v5, v1 ; encoding: [0x01,0xf3,0x0a,0x7e]
+
+v_rcp_bf16 v5, v127
+// GFX1250: v_rcp_bf16_e32 v5, v127 ; encoding: [0x7f,0xf3,0x0a,0x7e]
+
+v_rcp_bf16 v5, s1
+// GFX1250: v_rcp_bf16_e32 v5, s1 ; encoding: [0x01,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, s105
+// GFX1250: v_rcp_bf16_e32 v5, s105 ; encoding: [0x69,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, vcc_lo
+// GFX1250: v_rcp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, vcc_hi
+// GFX1250: v_rcp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, ttmp15
+// GFX1250: v_rcp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, m0
+// GFX1250: v_rcp_bf16_e32 v5, m0 ; encoding: [0x7d,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, exec_lo
+// GFX1250: v_rcp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, exec_hi
+// GFX1250: v_rcp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, null
+// GFX1250: v_rcp_bf16_e32 v5, null ; encoding: [0x7c,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, -1
+// GFX1250: v_rcp_bf16_e32 v5, -1 ; encoding: [0xc1,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, 0.5
+// GFX1250: v_rcp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, src_scc
+// GFX1250: v_rcp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v127, 0x8000
+// GFX1250: v_rcp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_sqrt_bf16 v5, v1
+// GFX1250: v_sqrt_bf16_e32 v5, v1 ; encoding: [0x01,0xf5,0x0a,0x7e]
+
+v_sqrt_bf16 v5, v127
+// GFX1250: v_sqrt_bf16_e32 v5, v127 ; encoding: [0x7f,0xf5,0x0a,0x7e]
+
+v_sqrt_bf16 v5, s1
+// GFX1250: v_sqrt_bf16_e32 v5, s1 ; encoding: [0x01,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, s105
+// GFX1250: v_sqrt_bf16_e32 v5, s105 ; encoding: [0x69,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, vcc_lo
+// GFX1250: v_sqrt_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, vcc_hi
+// GFX1250: v_sqrt_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, ttmp15
+// GFX1250: v_sqrt_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, m0
+// GFX1250: v_sqrt_bf16_e32 v5, m0 ; encoding: [0x7d,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, exec_lo
+// GFX1250: v_sqrt_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, exec_hi
+// GFX1250: v_sqrt_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, null
+// GFX1250: v_sqrt_bf16_e32 v5, null ; encoding: [0x7c,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, -1
+// GFX1250: v_sqrt_bf16_e32 v5, -1 ; encoding: [0xc1,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, 0.5
+// GFX1250: v_sqrt_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, src_scc
+// GFX1250: v_sqrt_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v127, 0x8000
+// GFX1250: v_sqrt_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_rsq_bf16 v5, v1
+// GFX1250: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e]
+
+v_rsq_bf16 v5, v127
+// GFX1250: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e]
+
+v_rsq_bf16 v5, s1
+// GFX1250: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, s105
+// GFX1250: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, vcc_lo
+// GFX1250: v_rsq_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, vcc_hi
+// GFX1250: v_rsq_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, ttmp15
+// GFX1250: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, m0
+// GFX1250: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, exec_lo
+// GFX1250: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, exec_hi
+// GFX1250: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, null
+// GFX1250: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, -1
+// GFX1250: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, 0.5
+// GFX1250: v_rsq_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, src_scc
+// GFX1250: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v127, 0x8000
+// GFX1250: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_log_bf16 v5, v1
+// GFX1250: v_log_bf16_e32 v5, v1 ; encoding: [0x01,0xf9,0x0a,0x7e]
+
+v_log_bf16 v5, v127
+// GFX1250: v_log_bf16_e32 v5, v127 ; encoding: [0x7f,0xf9,0x0a,0x7e]
+
+v_log_bf16 v5, s1
+// GFX1250: v_log_bf16_e32 v5, s1 ; encoding: [0x01,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, s105
+// GFX1250: v_log_bf16_e32 v5, s105 ; encoding: [0x69,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, vcc_lo
+// GFX1250: v_log_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, vcc_hi
+// GFX1250: v_log_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, ttmp15
+// GFX1250: v_log_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, m0
+// GFX1250: v_log_bf16_e32 v5, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, exec_lo
+// GFX1250: v_log_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, exec_hi
+// GFX1250: v_log_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, null
+// GFX1250: v_log_bf16_e32 v5, null ; encoding: [0x7c,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, -1
+// GFX1250: v_log_bf16_e32 v5, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, 0.5
+// GFX1250: v_log_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, src_scc
+// GFX1250: v_log_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e]
+
+v_log_bf16 v127, 0x8000
+// GFX1250: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_exp_bf16 v5, v1
+// GFX1250: v_exp_bf16_e32 v5, v1 ; encoding: [0x01,0xfb,0x0a,0x7e]
+
+v_exp_bf16 v5, v127
+// GFX1250: v_exp_bf16_e32 v5, v127 ; encoding: [0x7f,0xfb,0x0a,0x7e]
+
+v_exp_bf16 v5, s1
+// GFX1250: v_exp_bf16_e32 v5, s1 ; encoding: [0x01,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, s105
+// GFX1250: v_exp_bf16_e32 v5, s105 ; encoding: [0x69,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, vcc_lo
+// GFX1250: v_exp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, vcc_hi
+// GFX1250: v_exp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, ttmp15
+// GFX1250: v_exp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, m0
+// GFX1250: v_exp_bf16_e32 v5, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, exec_lo
+// GFX1250: v_exp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, exec_hi
+// GFX1250: v_exp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, null
+// GFX1250: v_exp_bf16_e32 v5, null ; encoding: [0x7c,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, -1
+// GFX1250: v_exp_bf16_e32 v5, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, 0.5
+// GFX1250: v_exp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, src_scc
+// GFX1250: v_exp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v127, 0x8000
+// GFX1250: v_exp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_sin_bf16 v5, v1
+// GFX1250: v_sin_bf16_e32 v5, v1 ; encoding: [0x01,0xfd,0x0a,0x7e]
+
+v_sin_bf16 v5, v127
+// GFX1250: v_sin_bf16_e32 v5, v127 ; encoding: [0x7f,0xfd,0x0a,0x7e]
+
+v_sin_bf16 v5, s1
+// GFX1250: v_sin_bf16_e32 v5, s1 ; encoding: [0x01,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, s105
+// GFX1250: v_sin_bf16_e32 v5, s105 ; encoding: [0x69,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, vcc_lo
+// GFX1250: v_sin_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, vcc_hi
+// GFX1250: v_sin_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, ttmp15
+// GFX1250: v_sin_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, m0
+// GFX1250: v_sin_bf16_e32 v5, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, exec_lo
+// GFX1250: v_sin_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, exec_hi
+// GFX1250: v_sin_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, null
+// GFX1250: v_sin_bf16_e32 v5, null ; encoding: [0x7c,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, -1
+// GFX1250: v_sin_bf16_e32 v5, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, 0.5
+// GFX1250: v_sin_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, src_scc
+// GFX1250: v_sin_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v127, 0x8000
+// GFX1250: v_sin_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_cos_bf16 v5, v1
+// GFX1250: v_cos_bf16_e32 v5, v1 ; encoding: [0x01,0xff,0x0a,0x7e]
+
+v_cos_bf16 v5, v127
+// GFX1250: v_cos_bf16_e32 v5, v127 ; encoding: [0x7f,0xff,0x0a,0x7e]
+
+v_cos_bf16 v5, s1
+// GFX1250: v_cos_bf16_e32 v5, s1 ; encoding: [0x01,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, s105
+// GFX1250: v_cos_bf16_e32 v5, s105 ; encoding: [0x69,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, vcc_lo
+// GFX1250: v_cos_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, vcc_hi
+// GFX1250: v_cos_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, ttmp15
+// GFX1250: v_cos_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, m0
+// GFX1250: v_cos_bf16_e32 v5, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, exec_lo
+// GFX1250: v_cos_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, exec_hi
+// GFX1250: v_cos_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, null
+// GFX1250: v_cos_bf16_e32 v5, null ; encoding: [0x7c,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, -1
+// GFX1250: v_cos_bf16_e32 v5, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, 0.5
+// GFX1250: v_cos_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, src_scc
+// GFX1250: v_cos_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v127, 0x8000
+// GFX1250: v_cos_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
 v_cvt_f32_bf16 v5, v1
 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
index 055f540fc2203..76272d25d92d4 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
@@ -28,6 +28,99 @@ v_mov_b64 v[4:5], 0.5
 v_mov_b64 v[254:255], 0xaf123456
 // GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+v_tanh_f32 v5, v1
+// GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e]
+
+v_tanh_f32 v5, v255
+// GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e]
+
+v_tanh_f32 v5, s1
+// GFX1250: v_tanh_f32_e32 v5, s1 ; encoding: [0x01,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, s105
+// GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, vcc_lo
+// GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, vcc_hi
+// GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, ttmp15
+// GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, m0
+// GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, exec_lo
+// GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, exec_hi
+// GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, null
+// GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, -1
+// GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, 0.5
+// GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v5, src_scc
+// GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e]
+
+v_tanh_f32 v255, 0xaf123456
+// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_tanh_f16 v5, v1
+// GFX1250: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e]
+
+v_tanh_f16 v5, v127
+// GFX1250: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e]
+
+v_tanh_f16 v5, s1
+// GFX1250: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, s105
+// GFX1250: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, vcc_lo
+// GFX1250: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, vcc_hi
+// GFX1250: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, ttmp15
+// GFX1250: v_tanh_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, m0
+// GFX1250: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, exec_lo
+// GFX1250: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, exec_hi
+// GFX1250: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, null
+// GFX1250: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, -1
+// GFX1250: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, 0.5
+// GFX1250: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, src_scc
+// GFX1250: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v127, 0x8000
+// GFX1250: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_tanh_f16 v5.h, v1.h
+// GFX1250: v_tanh_f16_e32 v5.h, v1.h ; encoding: [0x81,0x3f,0x0a,0x7f]
+
 v_tanh_bf16 v5, v1
 // GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]
@@ -76,6 +169,342 @@ v_tanh_bf16 v127, 0x8000
 v_tanh_bf16 v5.h, v1.h
 // GFX1250: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f]
+v_rcp_bf16 v5, v1
+// GFX1250: v_rcp_bf16_e32 v5, v1 ; encoding: [0x01,0xf3,0x0a,0x7e]
+
+v_rcp_bf16 v5, v127
+// GFX1250: v_rcp_bf16_e32 v5, v127 ; encoding: [0x7f,0xf3,0x0a,0x7e]
+
+v_rcp_bf16 v5, s1
+// GFX1250: v_rcp_bf16_e32 v5, s1 ; encoding: [0x01,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, s105
+// GFX1250: v_rcp_bf16_e32 v5, s105 ; encoding: [0x69,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, vcc_lo
+// GFX1250: v_rcp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, vcc_hi
+// GFX1250: v_rcp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, ttmp15
+// GFX1250: v_rcp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, m0
+// GFX1250: v_rcp_bf16_e32 v5, m0 ; encoding: [0x7d,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, exec_lo
+// GFX1250: v_rcp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, exec_hi
+// GFX1250: v_rcp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, null
+// GFX1250: v_rcp_bf16_e32 v5, null ; encoding: [0x7c,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, -1
+// GFX1250: v_rcp_bf16_e32 v5, -1 ; encoding: [0xc1,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, 0.5
+// GFX1250: v_rcp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v5, src_scc
+// GFX1250: v_rcp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf2,0x0a,0x7e]
+
+v_rcp_bf16 v127, 0x8000
+// GFX1250: v_rcp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_rcp_bf16 v5.h, v1.h
+// GFX1250: v_rcp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf3,0x0a,0x7f]
+
+v_sqrt_bf16 v5, v1
+// GFX1250: v_sqrt_bf16_e32 v5, v1 ; encoding: [0x01,0xf5,0x0a,0x7e]
+
+v_sqrt_bf16 v5, v127
+// GFX1250: v_sqrt_bf16_e32 v5, v127 ; encoding: [0x7f,0xf5,0x0a,0x7e]
+
+v_sqrt_bf16 v5, s1
+// GFX1250: v_sqrt_bf16_e32 v5, s1 ; encoding: [0x01,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, s105
+// GFX1250: v_sqrt_bf16_e32 v5, s105 ; encoding: [0x69,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, vcc_lo
+// GFX1250: v_sqrt_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, vcc_hi
+// GFX1250: v_sqrt_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, ttmp15
+// GFX1250: v_sqrt_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, m0
+// GFX1250: v_sqrt_bf16_e32 v5, m0 ; encoding: [0x7d,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, exec_lo
+// GFX1250: v_sqrt_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, exec_hi
+// GFX1250: v_sqrt_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, null
+// GFX1250: v_sqrt_bf16_e32 v5, null ; encoding: [0x7c,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, -1
+// GFX1250: v_sqrt_bf16_e32 v5, -1 ; encoding: [0xc1,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, 0.5
+// GFX1250: v_sqrt_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v5, src_scc
+// GFX1250: v_sqrt_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf4,0x0a,0x7e]
+
+v_sqrt_bf16 v127, 0x8000
+// GFX1250: v_sqrt_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_sqrt_bf16 v5.h, v1.h
+// GFX1250: v_sqrt_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf5,0x0a,0x7f]
+
+v_rsq_bf16 v5, v1
+// GFX1250: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e]
+
+v_rsq_bf16 v5, v127
+// GFX1250: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e]
+
+v_rsq_bf16 v5, s1
+// GFX1250: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, s105
+// GFX1250: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, vcc_lo
+// GFX1250: v_rsq_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, vcc_hi
+// GFX1250: v_rsq_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, ttmp15
+// GFX1250: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, m0
+// GFX1250: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, exec_lo
+// GFX1250: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, exec_hi
+// GFX1250: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, null
+// GFX1250: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, -1
+// GFX1250: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, 0.5
+// GFX1250: v_rsq_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v5, src_scc
+// GFX1250: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e]
+
+v_rsq_bf16 v127, 0x8000
+// GFX1250: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_rsq_bf16 v5.h, v1.h
+// GFX1250: v_rsq_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf7,0x0a,0x7f]
+
+v_log_bf16 v5, v1
+// GFX1250: v_log_bf16_e32 v5, v1 ; encoding: [0x01,0xf9,0x0a,0x7e]
+
+v_log_bf16 v5, v127
+// GFX1250: v_log_bf16_e32 v5, v127 ; encoding: [0x7f,0xf9,0x0a,0x7e]
+
+v_log_bf16 v5, s1
+// GFX1250: v_log_bf16_e32 v5, s1 ; encoding: [0x01,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, s105
+// GFX1250: v_log_bf16_e32 v5, s105 ; encoding: [0x69,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, vcc_lo
+// GFX1250: v_log_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, vcc_hi
+// GFX1250: v_log_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, ttmp15
+// GFX1250: v_log_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, m0
+// GFX1250: v_log_bf16_e32 v5, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, exec_lo
+// GFX1250: v_log_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, exec_hi
+// GFX1250: v_log_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, null
+// GFX1250: v_log_bf16_e32 v5, null ; encoding: [0x7c,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, -1
+// GFX1250: v_log_bf16_e32 v5, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, 0.5
+// GFX1250: v_log_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, src_scc
+// GFX1250: v_log_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e]
+
+v_log_bf16 v127, 0x8000
+// GFX1250: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_log_bf16 v5.h, v1.h
+// GFX1250: v_log_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf9,0x0a,0x7f]
+
+v_exp_bf16 v5, v1
+// GFX1250: v_exp_bf16_e32 v5, v1 ; encoding: [0x01,0xfb,0x0a,0x7e]
+
+v_exp_bf16 v5, v127
+// GFX1250: v_exp_bf16_e32 v5, v127 ; encoding: [0x7f,0xfb,0x0a,0x7e]
+
+v_exp_bf16 v5, s1
+// GFX1250: v_exp_bf16_e32 v5, s1 ; encoding: [0x01,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, s105
+// GFX1250: v_exp_bf16_e32 v5, s105 ; encoding: [0x69,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, vcc_lo
+// GFX1250: v_exp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, vcc_hi
+// GFX1250: v_exp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, ttmp15
+// GFX1250: v_exp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, m0
+// GFX1250: v_exp_bf16_e32 v5, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, exec_lo
+// GFX1250: v_exp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, exec_hi
+// GFX1250: v_exp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, null
+// GFX1250: v_exp_bf16_e32 v5, null ; encoding: [0x7c,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, -1
+// GFX1250: v_exp_bf16_e32 v5, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, 0.5
+// GFX1250: v_exp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v5, src_scc
+// GFX1250: v_exp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e]
+
+v_exp_bf16 v127, 0x8000
+// GFX1250: v_exp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_exp_bf16 v5.h, v1.h
+// GFX1250: v_exp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfb,0x0a,0x7f]
+
+v_sin_bf16 v5, v1
+// GFX1250: v_sin_bf16_e32 v5, v1 ; encoding: [0x01,0xfd,0x0a,0x7e]
+
+v_sin_bf16 v5, v127
+// GFX1250: v_sin_bf16_e32 v5, v127 ; encoding: [0x7f,0xfd,0x0a,0x7e]
+
+v_sin_bf16 v5, s1
+// GFX1250: v_sin_bf16_e32 v5, s1 ; encoding: [0x01,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, s105
+// GFX1250: v_sin_bf16_e32 v5, s105 ; encoding: [0x69,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, vcc_lo
+// GFX1250: v_sin_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, vcc_hi
+// GFX1250: v_sin_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, ttmp15
+// GFX1250: v_sin_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, m0
+// GFX1250: v_sin_bf16_e32 v5, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, exec_lo
+// GFX1250: v_sin_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, exec_hi
+// GFX1250: v_sin_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, null
+// GFX1250: v_sin_bf16_e32 v5, null ; encoding: [0x7c,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, -1
+// GFX1250: v_sin_bf16_e32 v5, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, 0.5
+// GFX1250: v_sin_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v5, src_scc
+// GFX1250: v_sin_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e]
+
+v_sin_bf16 v127, 0x8000
+// GFX1250: v_sin_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_sin_bf16 v5.h, v1.h
+// GFX1250: v_sin_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfd,0x0a,0x7f]
+
+v_cos_bf16 v5, v1
+// GFX1250: v_cos_bf16_e32 v5, v1 ; encoding: [0x01,0xff,0x0a,0x7e]
+
+v_cos_bf16 v5, v127
+// GFX1250: v_cos_bf16_e32 v5, v127 ; encoding: [0x7f,0xff,0x0a,0x7e]
+
+v_cos_bf16 v5, s1
+// GFX1250: v_cos_bf16_e32 v5, s1 ; encoding: [0x01,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, s105
+// GFX1250: v_cos_bf16_e32 v5, s105 ; encoding: [0x69,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, vcc_lo
+// GFX1250: v_cos_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, vcc_hi
+// GFX1250: v_cos_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, ttmp15
+// GFX1250: v_cos_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, m0
+// GFX1250: v_cos_bf16_e32 v5, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, exec_lo
+// GFX1250: v_cos_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, exec_hi
+// GFX1250: v_cos_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, null
+// GFX1250: v_cos_bf16_e32 v5, null ; encoding: [0x7c,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, -1
+// GFX1250: v_cos_bf16_e32 v5, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, 0.5
+// GFX1250: v_cos_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v5, src_scc
+// GFX1250: v_cos_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e]
+
+v_cos_bf16 v127, 0x8000
+// GFX1250: v_cos_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_cos_bf16 v5.h, v1.h
+// GFX1250: v_cos_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xff,0x0a,0x7f]
+
 v_cvt_f32_bf16 v5, v1
 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
index 4e5754f3961c1..0a8ee84561d33 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
@@ -2,6 +2,118 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+v_tanh_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_mirror
+// GFX1250: v_tanh_f32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_half_mirror
+// GFX1250: v_tanh_f32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_shl:1
+// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_shl:15
+// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_shr:1
+// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_shr:15
+// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_ror:1
+// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_ror:15
+// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_f32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_f32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_mirror
+// GFX1250: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_half_mirror
+// GFX1250: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shl:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shl:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shr:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shr:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_ror:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_ror:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
 v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -58,6 +170,398 @@ v_tanh_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 f
 // GFX1250: v_tanh_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_rcp_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_mirror
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_half_mirror
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_shl:1
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_shl:15
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_shr:1
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_shr:15
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_ror:1
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_ror:15
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_rcp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_sqrt_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_sqrt_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_mirror
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_half_mirror
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_shl:1
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_shl:15
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_shr:1
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_shr:15
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_ror:1
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_ror:15
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_sqrt_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_mirror
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_half_mirror
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_shl:1
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_shl:15
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_shr:1
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_shr:15
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_ror:1
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_ror:15
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_mirror
+// GFX1250: v_log_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_half_mirror
+// GFX1250: v_log_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_shl:1
+// GFX1250: v_log_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_shl:15
+// GFX1250: v_log_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_shr:1
+// GFX1250: v_log_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_shr:15
+// GFX1250: v_log_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_ror:1
+// GFX1250: v_log_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_ror:15
+// GFX1250: v_log_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_log_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_log_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_log_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_log_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_mirror
+// GFX1250: v_exp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_half_mirror
+// GFX1250: v_exp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_shl:1
+// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_shl:15
+// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_shr:1
+// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_shr:15
+// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_ror:1
+// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_ror:15
+// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_exp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_exp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_exp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_exp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_mirror
+// GFX1250: v_sin_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_half_mirror
+// GFX1250: v_sin_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_shl:1
+// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_shl:15
+// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_shr:1
+// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_shr:15
+// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_ror:1
+// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_ror:15
+// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_sin_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_sin_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_sin_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_sin_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_mirror
+// GFX1250: v_cos_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_half_mirror
+// GFX1250: v_cos_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_shl:1
+// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_shl:15
+// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_shr:1
+// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_shr:15
+// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_ror:1
+// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_ror:15
+// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_cos_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_cos_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_cos_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_cos_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
 v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
index a6787254ae60f..d4afb9d9b2d9a 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
@@ -2,6 +2,122 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
+v_tanh_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_mirror
+// GFX1250: v_tanh_f32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_half_mirror
+// GFX1250: v_tanh_f32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_shl:1
+// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_shl:15
+// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_shr:1
+// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_shr:15
+// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_ror:1
+// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_ror:15
+// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_f32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_f32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_mirror
+// GFX1250: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_half_mirror
+// GFX1250: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shl:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shl:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shr:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shr:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_ror:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_ror:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5.h, v1.h quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7f,0x81,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
 v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -62,6 +178,426 @@ v_tanh_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
 // GFX1250: v_tanh_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_rcp_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_mirror
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_half_mirror
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_shl:1
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_shl:15
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_shr:1
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_shr:15
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_ror:1
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_ror:15
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_rcp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_rcp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
+// GFX1250: v_rcp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7f,0x81,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_sqrt_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_sqrt_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_mirror
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_half_mirror
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_shl:1
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_shl:15
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_shr:1
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_shr:15
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_ror:1
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_ror:15
+// GFX1250: v_sqrt_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sqrt_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sqrt_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sqrt_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sqrt_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_sqrt_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_mirror +// GFX1250: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_half_mirror +// GFX1250: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shl:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shl:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shr:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shr:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_ror:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_rsq_bf16 v5, v1 row_ror:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_mirror +// GFX1250: v_log_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_half_mirror +// GFX1250: v_log_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shl:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shl:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shr:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shr:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_log_bf16 v5, v1 row_ror:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_ror:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_log_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_log_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_log_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_log_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_mirror +// GFX1250: v_exp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_half_mirror +// GFX1250: v_exp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shl:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shl:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shr:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_exp_bf16 v5, v1 row_shr:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_ror:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_ror:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_exp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_exp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_exp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_mirror +// GFX1250: v_sin_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_half_mirror +// GFX1250: v_sin_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shl:1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shl:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_sin_bf16 v5, v1 row_shr:1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shr:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_ror:1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_ror:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sin_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sin_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sin_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_mirror +// GFX1250: v_cos_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_half_mirror +// GFX1250: v_cos_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shl:1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_cos_bf16 v5, v1 row_shl:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shr:1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shr:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_ror:1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_ror:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cos_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cos_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cos_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index e6c35d5e3b863..a7cb6bf8de69c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -2,6 +2,30 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -14,6 +38,90 @@ v_tanh_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rcp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sqrt_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sqrt_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf4,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: 
v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_exp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git 
a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index 98bd1fd41fa60..6acab7edc0d49 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -2,6 +2,34 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -18,6 +46,118 @@ v_tanh_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7f,0x81,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rcp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0xe9,0xf2,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sqrt_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sqrt_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf4,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: 
v_exp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s index cc14e4caf851e..20bc578605b8c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s @@ -2,6 +2,158 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX1200-ERR --implicit-check-not=error: %s +v_fmac_f64 v[4:5], v[2:3], v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[254:255], v[2:3], v[4:5] +// GFX1250: v_fmac_f64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x2f] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[254:255], v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], v[254:255], v[4:5] ; 
encoding: [0xfe,0x09,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], vcc, v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], exec, v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], 0, v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], -1, v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], 0.5, v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], -4.0, v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], 0xaf123456, v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xff,0x08,0x08,0x2e,0x56,0x34,0x12,0xaf] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], 0x3f717273, v[4:5] +// GFX1250: v_fmac_f64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x2e,0x73,0x72,0x71,0x3f] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], v[254:255] +// GFX1250: v_fmac_f64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], v[8:9] +// GFX1250: v_fmac_f64_e32 v[4:5], v[2:3], v[8:9] ; encoding: [0x02,0x11,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[254:255], v[2:3], v[8:9] +// GFX1250: v_fmac_f64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x2f] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[254:255], v[8:9] +// GFX1250: v_fmac_f64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], vcc, v[8:9] +// GFX1250: v_fmac_f64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], exec, v[8:9] +// GFX1250: v_fmac_f64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], 0, v[8:9] +// GFX1250: v_fmac_f64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], -1, v[8:9] +// GFX1250: v_fmac_f64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], 0.5, v[8:9] +// GFX1250: v_fmac_f64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], -4.0, v[8:9] 
+// GFX1250: v_fmac_f64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], v[254:255] +// GFX1250: v_fmac_f64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x2e] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], vcc +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x17,0xd5,0x02,0xd5,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], exec +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x17,0xd5,0x02,0xfd,0x00,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], 0 +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x01,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], -1 +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x83,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], 0.5 +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0xe1,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], -4.0 +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0xef,0x01,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], -v[2:3], v[8:9] +// GFX1250: v_fmac_f64_e64 v[4:5], -v[2:3], v[8:9] ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x20] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], -v[8:9] +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x40] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], -v[2:3], -v[8:9] +// GFX1250: v_fmac_f64_e64 v[4:5], -v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x60] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], |v[2:3]|, v[8:9] +// GFX1250: v_fmac_f64_e64 v[4:5], |v[2:3]|, v[8:9] ; encoding: [0x04,0x01,0x17,0xd5,0x02,0x11,0x02,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], |v[8:9]| +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], |v[8:9]| ; encoding: [0x04,0x02,0x17,0xd5,0x02,0x11,0x02,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], |v[2:3]|, |v[8:9]| +// GFX1250: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], v[8:9] clamp +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x17,0xd5,0x02,0x11,0x02,0x00] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], v[8:9] mul:2 +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:2 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x08] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], v[8:9] mul:4 +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] 
mul:4 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x10] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fmac_f64 v[4:5], v[2:3], v[8:9] div:2 +// GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x18] +// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] // GFX1250: v_fmamk_f64 v[6:7], v[4:5], 0x405ec000, v[2:3] ; encoding: [0x04,0x05,0x0c,0x46,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 7196f407e9038..7486d849253e8 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -127,6 +127,411 @@ v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp // GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] +v_tanh_f32_e64 v5, v1 +// GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00] + +v_tanh_f32_e64 v5, v255 +// GFX1250: v_tanh_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x9e,0xd5,0xff,0x01,0x00,0x00] + +v_tanh_f32_e64 v5, s1 +// GFX1250: v_tanh_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, s105 +// GFX1250: v_tanh_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x9e,0xd5,0x69,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, vcc_lo +// GFX1250: v_tanh_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, vcc_hi +// GFX1250: v_tanh_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x6b,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, ttmp15 +// GFX1250: v_tanh_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9e,0xd5,0x7b,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, m0 +// GFX1250: v_tanh_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x9e,0xd5,0x7d,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, exec_lo +// GFX1250: v_tanh_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x7e,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, exec_hi +// GFX1250: v_tanh_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x7f,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, null +// GFX1250: v_tanh_f32_e64 v5, null ; encoding: [0x05,0x00,0x9e,0xd5,0x7c,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, -1 +// GFX1250: v_tanh_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x9e,0xd5,0xc1,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_tanh_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9e,0xd5,0xf0,0x00,0x00,0x08] + +v_tanh_f32_e64 v5, src_scc mul:4 +// GFX1250: v_tanh_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9e,0xd5,0xfd,0x00,0x00,0x10] + +v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_tanh_f16_e64 v5, v1 +// GFX1250: v_tanh_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] + +v_tanh_f16_e64 v5, v255 +// GFX1250: v_tanh_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] + +v_tanh_f16_e64 v5, s1 +// GFX1250: v_tanh_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, s105 +// GFX1250: v_tanh_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, vcc_lo +// GFX1250: v_tanh_f16_e64 v5, vcc_lo ; encoding: 
[0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, vcc_hi
+// GFX1250: v_tanh_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, ttmp15
+// GFX1250: v_tanh_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, m0
+// GFX1250: v_tanh_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, exec_lo
+// GFX1250: v_tanh_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, exec_hi
+// GFX1250: v_tanh_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, null
+// GFX1250: v_tanh_f16_e64 v5, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, -1
+// GFX1250: v_tanh_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, 0.5 mul:2
+// GFX1250: v_tanh_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08]
+
+v_tanh_f16_e64 v5, src_scc mul:4
+// GFX1250: v_tanh_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10]
+
+v_tanh_f16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_rcp_bf16_e64 v5, v1
+// GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00]
+
+v_rcp_bf16_e64 v5, v255
+// GFX1250: v_rcp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf9,0xd5,0xff,0x01,0x00,0x00]
+
+v_rcp_bf16_e64 v5, s1
+// GFX1250: v_rcp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, s105
+// GFX1250: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, vcc_lo
+// GFX1250: v_rcp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, vcc_hi
+// GFX1250: v_rcp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, ttmp15
+// GFX1250: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, m0
+// GFX1250: v_rcp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, exec_lo
+// GFX1250: v_rcp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, exec_hi
+// GFX1250: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, null
+// GFX1250: v_rcp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, -1
+// GFX1250: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
+
+v_rcp_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
+
+v_rcp_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, v1
+// GFX1250: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, v255
+// GFX1250: v_sqrt_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfa,0xd5,0xff,0x01,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, s1
+// GFX1250: v_sqrt_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, s105
+// GFX1250: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, vcc_lo
+// GFX1250: v_sqrt_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x6a,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, vcc_hi
+// GFX1250: v_sqrt_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x6b,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, ttmp15
+// GFX1250: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, m0
+// GFX1250: v_sqrt_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfa,0xd5,0x7d,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, exec_lo
+// GFX1250: v_sqrt_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x7e,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, exec_hi
+// GFX1250: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, null
+// GFX1250: v_sqrt_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfa,0xd5,0x7c,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, -1
+// GFX1250: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
+
+v_sqrt_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
+
+v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_rsq_bf16_e64 v5, v1
+// GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00]
+
+v_rsq_bf16_e64 v5, v255
+// GFX1250: v_rsq_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00]
+
+v_rsq_bf16_e64 v5, s1
+// GFX1250: v_rsq_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, s105
+// GFX1250: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, vcc_lo
+// GFX1250: v_rsq_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, vcc_hi
+// GFX1250: v_rsq_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, ttmp15
+// GFX1250: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, m0
+// GFX1250: v_rsq_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, exec_lo
+// GFX1250: v_rsq_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, exec_hi
+// GFX1250: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, null
+// GFX1250: v_rsq_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, -1
+// GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
+
+v_rsq_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
+
+v_rsq_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_log_bf16_e64 v5, v1
+// GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00]
+
+v_log_bf16_e64 v5, v255
+// GFX1250: v_log_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00]
+
+v_log_bf16_e64 v5, s1
+// GFX1250: v_log_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, s105
+// GFX1250: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, vcc_lo
+// GFX1250: v_log_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, vcc_hi
+// GFX1250: v_log_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, ttmp15
+// GFX1250: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, m0
+// GFX1250: v_log_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, exec_lo
+// GFX1250: v_log_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, exec_hi
+// GFX1250: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, null
+// GFX1250: v_log_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, -1
+// GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
+
+v_log_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
+
+v_log_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_exp_bf16_e64 v5, v1
+// GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00]
+
+v_exp_bf16_e64 v5, v255
+// GFX1250: v_exp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00]
+
+v_exp_bf16_e64 v5, s1
+// GFX1250: v_exp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, s105
+// GFX1250: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, vcc_lo
+// GFX1250: v_exp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, vcc_hi
+// GFX1250: v_exp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, ttmp15
+// GFX1250: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, m0
+// GFX1250: v_exp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, exec_lo
+// GFX1250: v_exp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, exec_hi
+// GFX1250: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, null
+// GFX1250: v_exp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, -1
+// GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
+
+v_exp_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
+
+v_exp_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_sin_bf16_e64 v5, v1
+// GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_bf16_e64 v5, v255
+// GFX1250: v_sin_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00]
+
+v_sin_bf16_e64 v5, s1
+// GFX1250: v_sin_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, s105
+// GFX1250: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, vcc_lo
+// GFX1250: v_sin_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, vcc_hi
+// GFX1250: v_sin_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, ttmp15
+// GFX1250: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, m0
+// GFX1250: v_sin_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, exec_lo
+// GFX1250: v_sin_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, exec_hi
+// GFX1250: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, null
+// GFX1250: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, -1
+// GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
+
+v_sin_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
+
+v_sin_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_cos_bf16_e64 v5, v1
+// GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_bf16_e64 v5, v255
+// GFX1250: v_cos_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00]
+
+v_cos_bf16_e64 v5, s1
+// GFX1250: v_cos_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, s105
+// GFX1250: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, vcc_lo
+// GFX1250: v_cos_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, vcc_hi
+// GFX1250: v_cos_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, ttmp15
+// GFX1250: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, m0
+// GFX1250: v_cos_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, exec_lo
+// GFX1250: v_cos_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, exec_hi
+// GFX1250: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, null
+// GFX1250: v_cos_bf16_e64 v5, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, -1
+// GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
+
+v_cos_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
+
+v_cos_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
 v_cvt_f32_bf16_e64 v5, v1
 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
index 3ea695f10c559..b59b8b31e2d5f 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
@@ -130,6 +130,435 @@ v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp
 v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp
 // GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00]
+v_tanh_f32_e64 v5, v1
+// GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00]
+
+v_tanh_f32_e64 v5, v255
+// GFX1250: v_tanh_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x9e,0xd5,0xff,0x01,0x00,0x00]
+
+v_tanh_f32_e64 v5, s1
+// GFX1250: v_tanh_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, s105
+// GFX1250: v_tanh_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x9e,0xd5,0x69,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, vcc_lo
+// GFX1250: v_tanh_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, vcc_hi
+// GFX1250: v_tanh_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x6b,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, ttmp15
+// GFX1250: v_tanh_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9e,0xd5,0x7b,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, m0
+// GFX1250: v_tanh_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x9e,0xd5,0x7d,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, exec_lo
+// GFX1250: v_tanh_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x7e,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, exec_hi
+// GFX1250: v_tanh_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x7f,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, null
+// GFX1250: v_tanh_f32_e64 v5, null ; encoding: [0x05,0x00,0x9e,0xd5,0x7c,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, -1
+// GFX1250: v_tanh_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x9e,0xd5,0xc1,0x00,0x00,0x00]
+
+v_tanh_f32_e64 v5, 0.5 mul:2
+// GFX1250: v_tanh_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9e,0xd5,0xf0,0x00,0x00,0x08]
+
+v_tanh_f32_e64 v5, src_scc mul:4
+// GFX1250: v_tanh_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9e,0xd5,0xfd,0x00,0x00,0x10]
+
+v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2
+// GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
+
+v_tanh_f16_e64 v5, v1
+// GFX1250: v_tanh_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00]
+
+v_tanh_f16_e64 v5, v255
+// GFX1250: v_tanh_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00]
+
+v_tanh_f16_e64 v5, s1
+// GFX1250: v_tanh_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, s105
+// GFX1250: v_tanh_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, vcc_lo
+// GFX1250: v_tanh_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, vcc_hi
+// GFX1250: v_tanh_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, ttmp15
+// GFX1250: v_tanh_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, m0
+// GFX1250: v_tanh_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, exec_lo
+// GFX1250: v_tanh_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, exec_hi
+// GFX1250: v_tanh_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, null
+// GFX1250: v_tanh_f16_e64 v5, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, -1
+// GFX1250: v_tanh_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00]
+
+v_tanh_f16_e64 v5, 0.5 mul:2
+// GFX1250: v_tanh_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08]
+
+v_tanh_f16_e64 v5, src_scc mul:4
+// GFX1250: v_tanh_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10]
+
+v_tanh_f16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_tanh_f16 v5.l, v128.h
+// GFX1250: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00]
+
+v_rcp_bf16_e64 v5, v1
+// GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00]
+
+v_rcp_bf16_e64 v5, v255
+// GFX1250: v_rcp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf9,0xd5,0xff,0x01,0x00,0x00]
+
+v_rcp_bf16_e64 v5, s1
+// GFX1250: v_rcp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, s105
+// GFX1250: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, vcc_lo
+// GFX1250: v_rcp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, vcc_hi
+// GFX1250: v_rcp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, ttmp15
+// GFX1250: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, m0
+// GFX1250: v_rcp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf9,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, exec_lo
+// GFX1250: v_rcp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, exec_hi
+// GFX1250: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, null
+// GFX1250: v_rcp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf9,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, -1
+// GFX1250: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
+
+v_rcp_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
+
+v_rcp_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
+
+v_rcp_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_rcp_bf16 v5.h, v128.h
+// GFX1250: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, v1
+// GFX1250: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, v255
+// GFX1250: v_sqrt_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfa,0xd5,0xff,0x01,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, s1
+// GFX1250: v_sqrt_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, s105
+// GFX1250: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, vcc_lo
+// GFX1250: v_sqrt_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x6a,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, vcc_hi
+// GFX1250: v_sqrt_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x6b,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, ttmp15
+// GFX1250: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, m0
+// GFX1250: v_sqrt_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfa,0xd5,0x7d,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, exec_lo
+// GFX1250: v_sqrt_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x7e,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, exec_hi
+// GFX1250: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, null
+// GFX1250: v_sqrt_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfa,0xd5,0x7c,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, -1
+// GFX1250: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
+
+v_sqrt_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
+
+v_sqrt_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
+
+v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_sqrt_bf16 v5.h, v128.h
+// GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00]
+
+v_rsq_bf16_e64 v5, v1
+// GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00]
+
+v_rsq_bf16_e64 v5, v255
+// GFX1250: v_rsq_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00]
+
+v_rsq_bf16_e64 v5, s1
+// GFX1250: v_rsq_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, s105
+// GFX1250: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, vcc_lo
+// GFX1250: v_rsq_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, vcc_hi
+// GFX1250: v_rsq_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, ttmp15
+// GFX1250: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, m0
+// GFX1250: v_rsq_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, exec_lo
+// GFX1250: v_rsq_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, exec_hi
+// GFX1250: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, null
+// GFX1250: v_rsq_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, -1
+// GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
+
+v_rsq_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
+
+v_rsq_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
+
+v_rsq_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_rsq_bf16 v5.h, v128.h
+// GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00]
+
+v_log_bf16_e64 v5, v1
+// GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00]
+
+v_log_bf16_e64 v5, v255
+// GFX1250: v_log_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00]
+
+v_log_bf16_e64 v5, s1
+// GFX1250: v_log_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, s105
+// GFX1250: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, vcc_lo
+// GFX1250: v_log_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, vcc_hi
+// GFX1250: v_log_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, ttmp15
+// GFX1250: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, m0
+// GFX1250: v_log_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, exec_lo
+// GFX1250: v_log_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, exec_hi
+// GFX1250: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, null
+// GFX1250: v_log_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, -1
+// GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
+
+v_log_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
+
+v_log_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
+
+v_log_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_log_bf16 v5.h, v128.h
+// GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00]
+
+v_exp_bf16_e64 v5, v1
+// GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00]
+
+v_exp_bf16_e64 v5, v255
+// GFX1250: v_exp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00]
+
+v_exp_bf16_e64 v5, s1
+// GFX1250: v_exp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, s105
+// GFX1250: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, vcc_lo
+// GFX1250: v_exp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, vcc_hi
+// GFX1250: v_exp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, ttmp15
+// GFX1250: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, m0
+// GFX1250: v_exp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, exec_lo
+// GFX1250: v_exp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, exec_hi
+// GFX1250: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, null
+// GFX1250: v_exp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, -1
+// GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
+
+v_exp_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
+
+v_exp_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
+
+v_exp_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_exp_bf16 v5.h, v128.h
+// GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00]
+
+v_sin_bf16_e64 v5, v1
+// GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00]
+
+v_sin_bf16_e64 v5, v255
+// GFX1250: v_sin_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00]
+
+v_sin_bf16_e64 v5, s1
+// GFX1250: v_sin_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, s105
+// GFX1250: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, vcc_lo
+// GFX1250: v_sin_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, vcc_hi
+// GFX1250: v_sin_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, ttmp15
+// GFX1250: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, m0
+// GFX1250: v_sin_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, exec_lo
+// GFX1250: v_sin_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, exec_hi
+// GFX1250: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, null
+// GFX1250: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, -1
+// GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
+
+v_sin_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
+
+v_sin_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
+
+v_sin_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_sin_bf16 v5.h, v128.h
+// GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00]
+
+v_cos_bf16_e64 v5, v1
+// GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00]
+
+v_cos_bf16_e64 v5, v255
+// GFX1250: v_cos_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00]
+
+v_cos_bf16_e64 v5, s1
+// GFX1250: v_cos_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, s105
+// GFX1250: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, vcc_lo
+// GFX1250: v_cos_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, vcc_hi
+// GFX1250: v_cos_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, ttmp15
+// GFX1250: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, m0
+// GFX1250: v_cos_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, exec_lo
+// GFX1250: v_cos_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, exec_hi
+// GFX1250: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, null
+// GFX1250: v_cos_bf16_e64 v5, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, -1
+// GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
+
+v_cos_bf16_e64 v5, 0.5 mul:2
+// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
+
+v_cos_bf16_e64 v5, src_scc mul:4
+// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
+
+v_cos_bf16_e64 v255, -|0x8000| clamp div:2
+// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
+
+v_cos_bf16_e64 v5.h, v128.h
+// GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00]
+
 v_cvt_f32_bf16_e64 v5, v1
 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
index f7cb2234fa059..f7f20f46161ce 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
@@ -2,6 +2,118 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
 
+v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_mirror
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_mirror
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
 v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -58,6 +170,398 @@ v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mas
 // GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_mirror
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_mirror
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_mirror
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_mirror
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_mirror
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_mirror
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_mirror
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
 v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
index dfa9741c7cf30..e1241b01ccae1 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -2,6 +2,122 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_mirror +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f32_e64_dpp 
v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_mirror +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -62,6 +178,426 @@ v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported 
on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, 
v1 row_half_mirror +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported 
on this GPU + +v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 686209d1f5189..0106175301d20 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -2,6 +2,38 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on 
this GPU + +v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -18,6 +50,118 @@ v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not 
supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 52cbe92dd5dca..93b86f3ffb841 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -2,6 +2,42 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f32_e64_dpp 
v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -22,6 +58,146 @@ v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rcp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_e64_dpp v5, 
v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// 
GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s new file mode 100644 index 0000000000000..e81b6a1d13391 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s @@ -0,0 +1,1234 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=WAVESIZE-ERR --implicit-check-not=error: --strict-whitespace %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction 
not supported on this GPU + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x04,0x5d,0xcc,0x00,0x05,0x12,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse ; encoding: [0x04,0x20,0x5d,0xcc,0x00,0x05,0x12,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse ; encoding: [0x04,0x40,0x5d,0xcc,0x00,0x05,0x12,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x62,0xcc,0x00,0x11,0x42,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires 
wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x62,0xcc,0x00,0x11,0x42,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x62,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x62,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x62,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +// GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x63,0xcc,0x00,0x11,0x42,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: 
v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x63,0xcc,0x00,0x11,0x42,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x63,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x63,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +// GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x63,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x1a,0x01,0x64,0xcc,0x00,0x11,0x42,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x1a,0x02,0x64,0xcc,0x00,0x11,0x42,0x5c] +// WAVESIZE-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x1a,0x04,0x64,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x1a,0x20,0x64,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x1a,0x40,0x64,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] +// GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6a,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse +// 
GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6a,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6a,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +// GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6b,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6b,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6b,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] +// GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: 
instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6c,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6c,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6c,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +// GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x9c] +// 
WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6d,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6d,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6d,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +// GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6e,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6e,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +// 
GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6e,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +// GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6f,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6f,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +// GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6f,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +// GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + 
+v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x70,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x70,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +// GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x70,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +// GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x71,0xcc,0x00,0x11,0x42,0x1c] +// 
WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x71,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +// GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x71,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] +// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1 +// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1 ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x06,0x1a] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x72,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: 
[0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x60,0xcc,0x00,0x11,0x42,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x60,0xcc,0x00,0x11,0x42,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x60,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x60,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x60,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], 
v[8:15], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x61,0xcc,0x00,0x11,0x42,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x61,0xcc,0x00,0x11,0x42,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x61,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x61,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse +// GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x61,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 +// GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x66,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +// GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x66,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x66,0xcc,0x00,0x11,0x82,0x3c] +// WAVESIZE-ERR: 
:[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x66,0xcc,0x00,0x11,0x82,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 +// GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x68,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 +// GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x68,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x68,0xcc,0x00,0x11,0x72,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x68,0xcc,0x00,0x11,0x72,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32
+// GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x69,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +// GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x69,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x69,0xcc,0x00,0x11,0x82,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x69,0xcc,0x00,0x11,0x82,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x69,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x69,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x73,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x73,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x73,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x73,0xcc,0x00,0x11,0x82,0x1c] 
+// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x74,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x74,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x74,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x74,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x75,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x75,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x75,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x75,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x76,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], 
v[0:7], v[8:23], v[32:33] index_key:1 +// GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x76,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x76,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x76,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x77,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x77,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x77,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x77,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x78,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x78,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: 
[0x18,0x20,0x78,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x78,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x79,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x79,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x79,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x79,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] +// GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x7a,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 +// GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x7a,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse +// GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x7a,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +// GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x7a,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not 
supported on this GPU + +v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] +// GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 +// GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x7b,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] +// GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] +// GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse +// GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x7b,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse +// GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x7b,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 +// GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x65,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 +// GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x65,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x65,0xcc,0x00,0x11,0x82,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: 
[0x18,0x02,0x65,0xcc,0x00,0x11,0x82,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse +// GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x65,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse +// GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x65,0xcc,0x00,0x11,0x82,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 +// GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x67,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 +// GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x67,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] +// GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x67,0xcc,0x00,0x11,0x72,0x3c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] +// GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x67,0xcc,0x00,0x11,0x72,0x5c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse +// GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse ; encoding: [0x18,0x20,0x67,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse +// GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; encoding: [0x18,0x40,0x67,0xcc,0x00,0x11,0x72,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 +// 
GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x84,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x84,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x84,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// 
GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x85,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x85,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x85,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x86,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x86,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX1250: 
v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x86,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] +// GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 +// GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x87,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse +// GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x87,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse +// GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x87,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not 
supported on this GPU + +v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x80,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x80,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x80,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; 
encoding: [0x10,0x04,0x81,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x81,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x81,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x82,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x82,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x82,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on 
this GPU + +v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] +// GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 +// GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x83,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse +// GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x83,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse +// GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x83,0xcc,0x00,0x11,0x42,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] +// GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 +// GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x1b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x9b] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// 
GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] +// GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x1c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU + +v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +// GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c] +// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32 +// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s new file mode 100644 index 0000000000000..47445d368d3a0 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32_err.s @@ -0,0 +1,384 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 %s 2>&1 | FileCheck --check-prefix=GFX1250-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], s[16:23] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], s[16:19] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], s[16:23] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], s[16:23] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], s[16:23] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], s[16:23] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], s[16:23] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], s[16:19] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], s[16:19] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], s[16:19] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], s[16:19] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x64_bf8_bf8 
v[16:19], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], s[16:23] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 128 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], s[16:23] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], s[16:19] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 3.0 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], s32 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], 1.0, v32 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], s28 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], 1.0, v28 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], s32 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], 1.0, v32 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], s[32:33] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], 1.0, v[32:33] +// GFX1250-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], s[32:33] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], 1.0, v[32:33] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], s[32:33] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], 1.0, v[32:33] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], s[32:33] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], 1.0, v[32:33] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
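+ +// Note on the check idiom used in this file: FileCheck's [[@LINE-N]] expands to +// the number of the line containing the substitution minus N, so each expected +// diagnostic stays anchored to the assembly statement directly above its check +// line even as tests are added or removed; the leading ":" simply leaves the +// file name out of the match against llvm-mc's <file>:<line>:<col> diagnostic +// prefix. The RUN line's --implicit-check-not=error: additionally fails the +// test if llvm-mc emits any diagnostic that no check line matches. A minimal +// sketch of the pattern, using a placeholder mnemonic and check prefix purely +// for illustration: +// +//   v_hypothetical_op v0, v1, v2 bogus_mod:1 +//   // <CHECK-PREFIX>: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction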
+ +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], s[28:29] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], 1.0, v[28:29] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], s[28:29] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], 1.0, v[28:29] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], s[28:29] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], 1.0, v[28:29] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[0,0,1] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], s[28:29] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], 1.0, v[28:29] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[1,0,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[0,1,0] +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+
+v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] neg_lo:[0,0,1]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], s[32:33]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], 1, v[32:33]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,0,1]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], s32
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], 1.0, v32
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,0,1]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], s28
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], 1.0, v28
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,0,1]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[1,0,0]
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,1,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,1,0]
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[1,0,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[1,0,0]
+// GFX1250-ERR-NEXT: {{^}} ^
+
+v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,1,0]
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+// GFX1250-ERR-NEXT: {{^}}v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,1,0]
+// GFX1250-ERR-NEXT: {{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx7_err_pos.s b/llvm/test/MC/AMDGPU/gfx7_err_pos.s
index 7b6b241e04707..9dcbd4a4074af 100644
--- a/llvm/test/MC/AMDGPU/gfx7_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx7_err_pos.s
@@ -44,16 +44,3 @@ s_load_dword s5, s[2:3], glc
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: cache policy is not supported for SMRD instructions
 // CHECK-NEXT:{{^}}s_load_dword s5, s[2:3], glc
 // CHECK-NEXT:{{^}} ^
-
-//==============================================================================
-// not a valid operand
-
-v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
-// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK-NEXT:{{^}} ^
-
-v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
-// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx8_err_pos.s b/llvm/test/MC/AMDGPU/gfx8_err_pos.s
index a475c739e690d..1e8457d54049a 100644
--- a/llvm/test/MC/AMDGPU/gfx8_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx8_err_pos.s
@@ -49,13 +49,3 @@ v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERV
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
 // CHECK-NEXT:{{^}}v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 src1_sel:WORD_0
 // CHECK-NEXT:{{^}} ^
-
-v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
-// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK-NEXT:{{^}} ^
-
-v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
-// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
index a1cd9ce8ef18e..f3f4cae22538a 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
@@ -2829,18 +2829,6 @@ v_alignbit_b32 v5, v1, v2, src_execz
 v_alignbit_b32 v5, v1, v2, src_scc
 // CHECK: [0x05,0x00,0xce,0xd1,0x01,0x05,0xf6,0x03]
-v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
-// CHECK: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
-
-v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
-// CHECK: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
-
-v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
-// CHECK: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
-
-v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
-// CHECK: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
-
 v_alignbyte_b32 v5, v1, v2, v3
 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04]
@@ -3012,18 +3000,6 @@ v_alignbyte_b32 v5, v1, v2, src_execz
 v_alignbyte_b32 v5, v1, v2, src_scc
 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xf6,0x03]
-v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1]
-// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04]
-
-v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1]
-// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04]
-
-v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1]
-// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04]
-
-v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
-// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04]
-
 v_min3_f32 v5, v1, v2, v3
 // CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04]
diff --git a/llvm/test/MC/AMDGPU/reloc-directive.s b/llvm/test/MC/AMDGPU/reloc-directive.s
index 99e972b3105a2..5d55fde4945ef 100644
--- a/llvm/test/MC/AMDGPU/reloc-directive.s
+++ b/llvm/test/MC/AMDGPU/reloc-directive.s
@@ -3,25 +3,7 @@
 # RUN: llvm-mc -filetype=obj -triple=amdgcn--amdhsa %s -o %t
 # RUN: llvm-readobj -r %t | FileCheck %s
-# PRINT: .reloc 2, R_AMDGPU_NONE, .data
-# PRINT-NEXT: .reloc 1, R_AMDGPU_NONE, foo+4
-# PRINT-NEXT: .reloc 0, R_AMDGPU_NONE, 8
-# PRINT-NEXT: .reloc 0, R_AMDGPU_ABS32_LO, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_ABS32_HI, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_ABS64, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_REL32, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_REL64, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_ABS32, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_GOTPCREL, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_GOTPCREL32_LO, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_GOTPCREL32_HI, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_REL32_LO, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_REL32_HI, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_RELATIVE64, .data
-# PRINT-NEXT: .reloc 0, R_AMDGPU_REL16, .data
-# PRINT-NEXT: .reloc 0, BFD_RELOC_NONE, .data
-# PRINT-NEXT: .reloc 0, BFD_RELOC_32, .data
-# PRINT-NEXT: .reloc 0, BFD_RELOC_64, .data
+# PRINT: .reloc {{.*}}+2, R_AMDGPU_NONE, .data
 # CHECK: 0x2 R_AMDGPU_NONE .data
 # CHECK-NEXT: 0x1 R_AMDGPU_NONE foo 0x4
@@ -44,27 +26,27 @@
 # CHECK-NEXT: 0x0 R_AMDGPU_ABS64 .data
 .text
+ .reloc .+2, R_AMDGPU_NONE, .data
+ .reloc .+1, R_AMDGPU_NONE, foo+4
+ .reloc .+0, R_AMDGPU_NONE, 8
+ .reloc .+0, R_AMDGPU_ABS32_LO, .data
+ .reloc .+0, R_AMDGPU_ABS32_HI, .data
+ .reloc .+0, R_AMDGPU_ABS64, .data
+ .reloc .+0, R_AMDGPU_REL32, .data
+ .reloc .+0, R_AMDGPU_REL64, .data
+ .reloc .+0, R_AMDGPU_ABS32, .data
+ .reloc .+0, R_AMDGPU_GOTPCREL, .data
+ .reloc .+0, R_AMDGPU_GOTPCREL32_LO, .data
+ .reloc .+0, R_AMDGPU_GOTPCREL32_HI, .data
+ .reloc .+0, R_AMDGPU_REL32_LO, .data
+ .reloc .+0, R_AMDGPU_REL32_HI, .data
+ .reloc .+0, R_AMDGPU_RELATIVE64, .data
+ .reloc .+0, R_AMDGPU_REL16, .data
+ .reloc .+0, BFD_RELOC_NONE, .data
+ .reloc .+0, BFD_RELOC_32, .data
+ .reloc .+0, BFD_RELOC_64, .data
 s_nop 0
 s_nop 0
- .reloc 2, R_AMDGPU_NONE, .data
- .reloc 1, R_AMDGPU_NONE, foo+4
- .reloc 0, R_AMDGPU_NONE, 8
- .reloc 0, R_AMDGPU_ABS32_LO, .data
- .reloc 0, R_AMDGPU_ABS32_HI, .data
- .reloc 0, R_AMDGPU_ABS64, .data
- .reloc 0, R_AMDGPU_REL32, .data
- .reloc 0, R_AMDGPU_REL64, .data
- .reloc 0, R_AMDGPU_ABS32, .data
- .reloc 0, R_AMDGPU_GOTPCREL, .data
- .reloc 0, R_AMDGPU_GOTPCREL32_LO, .data
- .reloc 0, R_AMDGPU_GOTPCREL32_HI, .data
- .reloc 0, R_AMDGPU_REL32_LO, .data
- .reloc 0, R_AMDGPU_REL32_HI, .data
- .reloc 0, R_AMDGPU_RELATIVE64, .data
- .reloc 0, R_AMDGPU_REL16, .data
- .reloc 0, BFD_RELOC_NONE, .data
- .reloc 0, BFD_RELOC_32, .data
- .reloc 0, BFD_RELOC_64, .data
 .data
 .globl foo
diff --git a/llvm/test/MC/ARM/AlignedBundling/group-bundle-arm.s b/llvm/test/MC/ARM/AlignedBundling/group-bundle-arm.s
deleted file mode 100644
index ca78f2acc2894..0000000000000
--- a/llvm/test/MC/ARM/AlignedBundling/group-bundle-arm.s
+++ /dev/null
@@ -1,48 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple armv7-linux-gnueabi %s -o - \
-# RUN: | llvm-objdump --no-show-raw-insn --triple=armv7 -d - | FileCheck %s
-
-# On ARM each instruction is 4 bytes long so padding for individual
-# instructions should not be inserted. However, for bundle-locked groups
-# it can be.
-
- .syntax unified
- .text
- .bundle_align_mode 4
-
- bx lr
- and r1, r1, r2
- and r1, r1, r2
- .bundle_lock
- bx r9
- bx r8
- .bundle_unlock
-# CHECK: c: nop
-# CHECK-NEXT: 10: bx
-# CHECK-NEXT: 14: bx
-
- # pow2 here
- .align 4
- bx lr
- .bundle_lock
- bx r9
- bx r9
- bx r9
- bx r8
- .bundle_unlock
-# CHECK: 20: bx
-# CHECK-NEXT: 24: nop
-# CHECK-NEXT: 28: nop
-# CHECK-NEXT: 2c: nop
-# CHECK-NEXT: 30: bx
-
- .align 4
-foo:
- b foo
- .long 3892240112
- .long 3892240112
- .long 3892240112
- .long 3892240112
- .long 3892240112
- .long 3892240112
-# CHECK: 40: b
-
diff --git a/llvm/test/MC/ARM/AlignedBundling/illegal-subtarget-change.s b/llvm/test/MC/ARM/AlignedBundling/illegal-subtarget-change.s
deleted file mode 100644
index e831ac5c2af06..0000000000000
--- a/llvm/test/MC/ARM/AlignedBundling/illegal-subtarget-change.s
+++ /dev/null
@@ -1,16 +0,0 @@
-# RUN: not --crash llvm-mc -filetype=obj -triple armv7-linux-gnueabi %s -o - 2>&1 | FileCheck %s
-
- # We cannot switch subtargets mid-bundle
- .syntax unified
- .text
- .bundle_align_mode 4
- .arch armv4t
- bx lr
- .bundle_lock
- bx lr
- .arch armv7a
- movt r0, #0xffff
- movw r0, #0xffff
- .bundle_unlock
- bx lr
-# CHECK: LLVM ERROR: A Bundle can only have one Subtarget.
diff --git a/llvm/test/MC/ARM/AlignedBundling/lit.local.cfg b/llvm/test/MC/ARM/AlignedBundling/lit.local.cfg
deleted file mode 100644
index 42bf50dcc13c3..0000000000000
--- a/llvm/test/MC/ARM/AlignedBundling/lit.local.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-if not "X86" in config.root.targets:
- config.unsupported = True
diff --git a/llvm/test/MC/ARM/AlignedBundling/pad-align-to-bundle-end.s b/llvm/test/MC/ARM/AlignedBundling/pad-align-to-bundle-end.s
deleted file mode 100644
index c01c8c178afeb..0000000000000
--- a/llvm/test/MC/ARM/AlignedBundling/pad-align-to-bundle-end.s
+++ /dev/null
@@ -1,41 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple armv7-linux-gnueabi %s -o - \
-# RUN: | llvm-objdump --no-show-raw-insn --triple=armv7 -d - | FileCheck %s
-
- .syntax unified
- .text
- .bundle_align_mode 4
-
- bx lr
- and r1, r1, r2
- and r1, r1, r2
- .bundle_lock align_to_end
- bx r9
- .bundle_unlock
-# No padding required here because bx just happens to be in the
-# right offset.
-# CHECK: 8: and
-# CHECK-NEXT: c: bx
-
- bx lr
- and r1, r1, r2
- .bundle_lock align_to_end
- bx r9
- .bundle_unlock
-# A 4-byte padding is needed here
-# CHECK: 18: nop
-# CHECK-NEXT: 1c: bx
-
- bx lr
- and r1, r1, r2
- .bundle_lock align_to_end
- bx r9
- bx r9
- bx r9
- .bundle_unlock
-# A 12-byte padding is needed here to push the group to the end of the next
-# bundle
-# CHECK: 28: nop
-# CHECK-NEXT: 2c: nop
-# CHECK-NEXT: 30: nop
-# CHECK-NEXT: 34: bx
-
diff --git a/llvm/test/MC/ARM/AlignedBundling/subtarget-change.s b/llvm/test/MC/ARM/AlignedBundling/subtarget-change.s
deleted file mode 100644
index 5672560376d46..0000000000000
--- a/llvm/test/MC/ARM/AlignedBundling/subtarget-change.s
+++ /dev/null
@@ -1,33 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple armv7-linux-gnueabi %s -o - \
-# RUN: | llvm-objdump --no-print-imm-hex --no-show-raw-insn --triple=armv7 -d - | FileCheck %s
-
- # We can switch subtargets with .arch outside of a bundle
- .syntax unified
- .text
- .bundle_align_mode 4
- .arch armv4t
- bx lr
- .bundle_lock
- and r1, r1, r1
- and r1, r1, r1
- .bundle_unlock
- bx lr
-
- # We can switch subtargets at the start of a bundle
- bx lr
- .bundle_lock align_to_end
- .arch armv7a
- movt r0, #0xffff
- movw r0, #0xffff
- .bundle_unlock
- bx lr
-
-# CHECK: 0: bx lr
-# CHECK-NEXT: 4: and r1, r1, r1
-# CHECK-NEXT: 8: and r1, r1, r1
-# CHECK-NEXT: c: bx lr
-# CHECK-NEXT: 10: bx lr
-# CHECK-NEXT: 14: nop
-# CHECK-NEXT: 18: movt r0, #65535
-# CHECK-NEXT: 1c: movw r0, #65535
-# CHECK-NEXT: 20: bx lr
diff --git a/llvm/test/MC/ARM/arm_instructions.s b/llvm/test/MC/ARM/arm_instructions.s
index a4c100ee68f90..ab532ec321078 100644
--- a/llvm/test/MC/ARM/arm_instructions.s
+++ b/llvm/test/MC/ARM/arm_instructions.s
@@ -1,14 +1,8 @@
 @ RUN: llvm-mc -mcpu=cortex-a8 -triple arm-unknown-unknown -show-encoding %s \
 @ RUN: | FileCheck %s -check-prefix=ALL
-@ RUN: llvm-mc -mcpu=cortex-a9 -triple armv7-unknown-nacl -show-encoding %s \
-@ RUN: | FileCheck %s -check-prefix=NACL
-@ RUN: llvm-mc -mcpu=cortex-a8 -mattr=+nacl-trap -triple armv7 -show-encoding %s \
-@ RUN: | FileCheck %s -check-prefix=NACL
 @ ALL: trap
 @ ALL: encoding: [0xfe,0xde,0xff,0xe7]
-@ NACL: trap
-@ NACL: encoding: [0xf0,0xde,0xfe,0xe7]
 trap
 @ CHECK: bx lr
diff --git a/llvm/test/MC/ARM/reloc-directive-err.s b/llvm/test/MC/ARM/reloc-directive-err.s
index c291fd62d2ba9..113158c51c36b 100644
--- a/llvm/test/MC/ARM/reloc-directive-err.s
+++ b/llvm/test/MC/ARM/reloc-directive-err.s
@@ -1,6 +1,6 @@
 # RUN: llvm-mc -triple=armv7 %s 2>&1 | FileCheck --check-prefix=PRINT %s
 # RUN: not llvm-mc -filetype=obj -triple=armv7 %s -o /dev/null 2>&1 | FileCheck %s
-# PRINT: .reloc 0, R_INVALID, 0
+# PRINT: .reloc {{.*}}, R_INVALID, 0
 # CHECK: {{.*}}.s:[[# @LINE+1]]:11: error: unknown relocation name
-.reloc 0, R_INVALID, 0
+.reloc ., R_INVALID, 0
diff --git a/llvm/test/MC/ARM/reloc-directive.s b/llvm/test/MC/ARM/reloc-directive.s
index 682f0e1185c72..6a3b2496cfc8d 100644
--- a/llvm/test/MC/ARM/reloc-directive.s
+++ b/llvm/test/MC/ARM/reloc-directive.s
@@ -10,21 +10,21 @@
 # RUN: llvm-readelf -x .data %t | FileCheck --check-prefix=HEX %s
 .text
+ .reloc .+8, R_ARM_NONE, .data
+ .reloc .+4, R_ARM_NONE, foo+4
+ .reloc .+0, R_ARM_NONE, 8
+
+ .reloc .+0, R_ARM_ALU_PC_G0, .data+2
+ .reloc .+0, R_ARM_LDR_PC_G0, foo+3
+ .reloc .+0, R_ARM_THM_ALU_PREL_11_0, 5
+
+ .reloc .+0, BFD_RELOC_NONE, 9
+ .reloc .+0, BFD_RELOC_8, 9
+ .reloc .+0, BFD_RELOC_16, 9
+ .reloc .+0, BFD_RELOC_32, 9
 bx lr
 nop
 nop
- .reloc 8, R_ARM_NONE, .data
- .reloc 4, R_ARM_NONE, foo+4
- .reloc 0, R_ARM_NONE, 8
-
- .reloc 0, R_ARM_ALU_PC_G0, .data+2
- .reloc 0, R_ARM_LDR_PC_G0, foo+3
- .reloc 0, R_ARM_THM_ALU_PREL_11_0, 5
-
- .reloc 0, BFD_RELOC_NONE, 9
- .reloc 0, BFD_RELOC_8, 9
- .reloc 0, BFD_RELOC_16, 9
- .reloc 0, BFD_RELOC_32, 9
 .data
 .globl foo
@@ -33,16 +33,7 @@ foo:
 .word 0
 .word 0
-# PRINT: .reloc 8, R_ARM_NONE, .data
-# PRINT: .reloc 4, R_ARM_NONE, foo+4
-# PRINT: .reloc 0, R_ARM_NONE, 8
-# PRINT: .reloc 0, R_ARM_ALU_PC_G0, .data+2
-# PRINT: .reloc 0, R_ARM_LDR_PC_G0, foo+3
-# PRINT: .reloc 0, R_ARM_THM_ALU_PREL_11_0, 5
-# PRINT: .reloc 0, BFD_RELOC_NONE, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9
+# PRINT: .reloc {{.*}}+8, R_ARM_NONE, .data
 # ARM relocations use the Elf32_Rel format. Addends are neither stored in the
 # relocation entries nor applied in the referenced locations.
diff --git a/llvm/test/MC/AVR/reloc-directive-err.s b/llvm/test/MC/AVR/reloc-directive-err.s
index d660bde487e3a..8494a66c6b727 100644
--- a/llvm/test/MC/AVR/reloc-directive-err.s
+++ b/llvm/test/MC/AVR/reloc-directive-err.s
@@ -1,10 +1,10 @@
 # RUN: llvm-mc -triple=avr %s 2>&1 | FileCheck --check-prefix=PRINT %s
 # RUN: not llvm-mc -filetype=obj -triple=avr %s -o /dev/null 2>&1 | FileCheck %s
-# PRINT: .reloc 0, R_INVALID, 0
+# PRINT: .reloc {{.*}}, R_INVALID, 0
 # CHECK: {{.*}}.s:[[#@LINE+1]]:11: error: unknown relocation name
-.reloc 0, R_INVALID, 0
+.reloc ., R_INVALID, 0
-# PRINT: .reloc 0, BFD_RELOC_64, 0
+# PRINT: .reloc {{.*}}, BFD_RELOC_64, 0
 # CHECK: {{.*}}.s:[[#@LINE+1]]:11: error: unknown relocation name
-.reloc 0, BFD_RELOC_64, 0
+.reloc ., BFD_RELOC_64, 0
diff --git a/llvm/test/MC/AVR/reloc-directive.s b/llvm/test/MC/AVR/reloc-directive.s
index 60913172502cf..9940842171eef 100644
--- a/llvm/test/MC/AVR/reloc-directive.s
+++ b/llvm/test/MC/AVR/reloc-directive.s
@@ -1,14 +1,7 @@
 # RUN: llvm-mc -triple=avr %s | FileCheck --check-prefix=PRINT %s
 # RUN: llvm-mc -filetype=obj -triple=avr %s | llvm-readobj -r - | FileCheck %s
-# PRINT: .reloc 4, R_AVR_NONE, .data
-# PRINT-NEXT: .reloc 2, R_AVR_NONE, foo+4
-# PRINT-NEXT: .reloc 0, R_AVR_NONE, 8
-# PRINT: .reloc 0, R_AVR_32, .data+2
-# PRINT-NEXT: .reloc 0, R_AVR_16, foo+3
-# PRINT: .reloc 0, BFD_RELOC_NONE, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9
+# PRINT: .reloc {{.*}}+4, R_AVR_NONE, .data
 # CHECK: Section ({{.*}}) .rela.text {
 # CHECK-NEXT: 0x4 R_AVR_NONE .data 0x0
@@ -22,19 +15,19 @@
 # CHECK-NEXT: }
 .text
+ .reloc .+4, R_AVR_NONE, .data
+ .reloc .+2, R_AVR_NONE, foo+4
+ .reloc .+0, R_AVR_NONE, 8
+
+ .reloc .+0, R_AVR_32, .data+2
+ .reloc .+0, R_AVR_16, foo+3
+
+ .reloc .+0, BFD_RELOC_NONE, 9
+ .reloc .+0, BFD_RELOC_16, 9
+ .reloc .+0, BFD_RELOC_32, 9
 ret
 nop
 nop
- .reloc 4, R_AVR_NONE, .data
- .reloc 2, R_AVR_NONE, foo+4
- .reloc 0, R_AVR_NONE, 8
-
- .reloc 0, R_AVR_32, .data+2
- .reloc 0, R_AVR_16, foo+3
-
- .reloc 0, BFD_RELOC_NONE, 9
- .reloc 0, BFD_RELOC_16, 9
- .reloc 0, BFD_RELOC_32, 9
 .data
 .globl foo
diff --git a/llvm/test/MC/AsmParser/AArch64/directive-parse-err.s b/llvm/test/MC/AsmParser/AArch64/directive-parse-err.s
index 02cdfd7829198..5e2dd4adcdf8c 100644
--- a/llvm/test/MC/AsmParser/AArch64/directive-parse-err.s
+++ b/llvm/test/MC/AsmParser/AArch64/directive-parse-err.s
@@ -1,5 +1,5 @@
 // RUN: not llvm-mc -triple aarch64 %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -triple aarch64 %s 2>&1 | grep "error:" | count 60
+// RUN: not llvm-mc -triple aarch64 %s 2>&1 | grep "error:" | count 59
 // CHECK: [[#@LINE+1]]:19: error: expected newline
 .equ ident1, 0 $
@@ -175,11 +175,6 @@
 // CHECK-NOT: :[[#@LINE+1]]:{{[0-9]+}}: error:
 .cv_loc 1 1 // EOL COMMENT
-
- // CHECK: [[#@LINE+1]]:28: error: expected newline
- .bundle_lock align_to_end $
- // CHECK-NOT: [[#@LINE+1]]:{{[0-9]+}}: error:
- .bundle_lock align_to_end // EOL COMMENT
 // CHECK: [[#@LINE+1]]:11: error: invalid token in expression
 .sleb128 $
 // CHECK-NOT: :[[#@LINE+1]]:{{[0-9]+}}: error:
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
index 08ed50d92ba83..721babdd64245 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
@@ -1146,18 +1146,6 @@
 # GFX10: v_alignbit_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04]
 0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04
-# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04]
-0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04
-
-# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04]
-0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04
-
-# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04]
-0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04
-
-# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04]
-0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04
-
 # GFX10: v_alignbyte_b32 v255, v1, v2, v3 ; encoding: [0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04]
 0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04
@@ -1245,18 +1233,6 @@
 # GFX10: v_alignbyte_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04]
 0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04
-# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04]
-0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04
-
-# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04]
-0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04
-
-# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04]
-0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04
-
-# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04]
-0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04
-
 # GFX10: v_and_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00]
 0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt
index 89731fcc936e6..83fa647696d6c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt
@@ -1,5 +1,17 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+# GFX1250: s_add_pc_i64 lit64(0x12345678abcd0) ; encoding: [0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00]
+0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00
+
+# GFX1250: s_add_pc_i64 0x64 ; encoding: [0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00]
+0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00
+
+# GFX1250: s_add_pc_i64 4 ; encoding: [0x84,0x4b,0x80,0xbe]
+0x84,0x4b,0x80,0xbe
+
+# GFX1250: s_add_pc_i64 s[2:3] ; encoding: [0x02,0x4b,0x80,0xbe]
+0x02,0x4b,0x80,0xbe
+
 # GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe]
 0xc3,0x4e,0x80,0xbe
diff --git
a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt index 6421c6f30e177..55bc3e7a5746c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt @@ -1,5 +1,2831 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +# GFX1250: flat_atomic_add_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff] +0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00] +0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00 + +# GFX1250: flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 
+ +# GFX1250: flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85] +0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85 + +# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a] +0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a + +# GFX1250: flat_atomic_add_f32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u32 v[4:5], v5 ; encoding: [0x7c,0x40,0x0d,0xec,0x00,0x00,0x80,0x02,0x04,0x00,0x00,0x00] +0x7c,0x40,0x0d,0xec,0x00,0x00,0x80,0x02,0x04,0x00,0x00,0x00 + +# GFX1250: flat_atomic_add_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_and_b32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: 
flat_atomic_and_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_and_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_and_b64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v0, v[2:3], v[4:5] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xec,0x00,0x00,0x10,0x02,0x02,0xff,0x07,0x00] +0x7c,0x00,0x0d,0xec,0x00,0x00,0x10,0x02,0x02,0xff,0x07,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v0, v[2:3], v[4:5] th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xec,0x00,0x00,0x10,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x0d,0xec,0x00,0x00,0x10,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v1, v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cmpswap_b32 v1, v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v[0:1], v[2:5] offset:-64 ; encoding: [0x7c,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cmpswap_b64 v[0:1], v[2:5] offset:64 ; encoding: [0x7c,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v[2:3], v[4:5], v[6:9] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00] +0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v[2:3], v[4:5], v[6:9] th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00] 
+0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00 + +# GFX1250: flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cond_sub_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_dec_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_dec_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_dec_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_dec_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_inc_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0xc0,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_inc_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0xc0,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: 
flat_atomic_inc_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_inc_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_i32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_i32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_i64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_i64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_num_f32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_num_f32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_num_f32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
flat_atomic_max_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0xc0,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0xc0,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_i32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_i32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_i64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_i64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_num_f32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_num_f32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_num_f32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_num_f32 v[0:1], v2 offset:64 ; encoding: 
[0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_or_b32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_or_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_or_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_or_b64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85]
+0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85
+
+# GFX1250: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a]
+0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a
+
+# GFX1250: flat_atomic_pk_add_bf16 v[0:1], v2 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00
+
+# GFX1250: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_pk_add_bf16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85]
+0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85
+
+# GFX1250: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a]
+0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a
+
+# GFX1250: flat_atomic_pk_add_f16 v[0:1], v2 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00
+
+# GFX1250: flat_atomic_pk_add_f16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_pk_add_f16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_sub_clamp_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_sub_clamp_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_sub_clamp_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0xc0,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_sub_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_sub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_sub_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_sub_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_sub_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_sub_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_sub_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_sub_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_swap_b32 v0, v[2:3], v3 offset:-2048 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0xf8,0xff]
+0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0xf8,0xff
+
+# GFX1250: flat_atomic_swap_b32 v0, v[2:3], v3 offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0xff,0x07,0x00]
+0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0xff,0x07,0x00
+
+# GFX1250: flat_atomic_swap_b32 v0, v[2:3], v3 offset:2048 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0x08,0x00]
+0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0x08,0x00
+
+# GFX1250: flat_atomic_swap_b32 v0, v[2:3], v3 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0x00,0x00]
+0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_atomic_swap_b32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x0c,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_swap_b32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0c,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_swap_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x0c,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_swap_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0c,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_swap_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_swap_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_swap_b64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_swap_b64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_swap_b64 v[2:3], v[4:5], v[6:7] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00]
+0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00
+
+# GFX1250: flat_atomic_swap_b64 v[2:3], v[4:5], v[6:7] th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00]
+0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_atomic_xor_b32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_xor_b32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_xor_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_xor_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_xor_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_xor_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_atomic_xor_b64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_atomic_xor_b64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_b128 v[2:5], v[0:1] offset:-64 ; encoding: [0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_b128 v[2:5], v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_b128 v[2:5], v[6:7] ; encoding: [0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00]
+0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00
+
+# GFX1250: flat_load_b32 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_b32 v1, v[0:1] offset:64 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_b32 v1, v[4:5] ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_b32 v1, v[4:5] offset:-2048 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0xf8,0xff]
+0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0xf8,0xff
+
+# GFX1250: flat_load_b32 v1, v[4:5] offset:-4 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0xfc,0xff,0xff]
+0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0xfc,0xff,0xff
+
+# GFX1250: flat_load_b32 v1, v[4:5] offset:2047 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0xff,0x07,0x00]
+0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0xff,0x07,0x00
+
+# GFX1250: flat_load_b32 v1, v[4:5] offset:2048 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x08,0x00]
+0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x08,0x00
+
+# GFX1250: flat_load_b32 v1, v[4:5] offset:4 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x04,0x00,0x00]
+0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x04,0x00,0x00
+
+# GFX1250: flat_load_b64 v[2:3], v[0:1] offset:-64 ; encoding: [0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_b64 v[2:3], v[0:1] offset:64 ; encoding: [0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_b64 v[2:3], v[4:5] ; encoding: [0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_b96 v[2:4], v[0:1] offset:-64 ; encoding: [0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_b96 v[2:4], v[0:1] offset:64 ; encoding: [0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_b96 v[2:4], v[6:7] ; encoding: [0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00]
+0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00
+
+# GFX1250: flat_load_d16_b16 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_d16_b16 v1, v[0:1] offset:64 ; encoding: [0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_b16 v1, v[4:5] ; encoding: [0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_b16 v1, v[0:1] offset:-64 ; encoding: [0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_d16_hi_b16 v1, v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_b16 v1, v[4:5] ; encoding: [0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_i8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_d16_hi_i8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_i8 v1, v[4:5] ; encoding: [0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_u8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_d16_hi_u8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_hi_u8 v1, v[4:5] ; encoding: [0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_d16_i8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_d16_i8 v1, v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_i8 v1, v[4:5] ; encoding: [0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_d16_u8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_d16_u8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_d16_u8 v1, v[4:5] ; encoding: [0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_i16 v1, v[0:1] offset:-64 ; encoding: [0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_i16 v1, v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_i16 v1, v[4:5] ; encoding: [0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_i8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_i8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_i8 v1, v[4:5] ; encoding: [0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_u16 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_u16 v1, v[0:1] offset:64 ; encoding: [0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_u16 v1, v[4:5] ; encoding: [0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_load_u8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_load_u8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_load_u8 v1, v[4:5] ; encoding: [0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_store_b128 v[0:1], v[2:5] offset:-64 ; encoding: [0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_store_b128 v[0:1], v[2:5] offset:64 ; encoding: [0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_store_b128 v[2:3], v[4:7] ; encoding: [0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_store_b16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_store_b16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_store_b16 v[4:5], v1 ; encoding: [0x7c,0x40,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x40,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_store_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_store_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_store_b32 v[4:5], v1 offset:-16 ; encoding: [0x7c,0x80,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0xf0,0xff,0xff]
+0x7c,0x80,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0xf0,0xff,0xff
+
+# GFX1250: flat_store_b32 v[4:5], v1 offset:16 ; encoding: [0x7c,0x80,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x10,0x00,0x00]
+0x7c,0x80,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x10,0x00,0x00
+
+# GFX1250: flat_store_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_store_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_store_b64 v[2:3], v[4:5] ; encoding: [0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00]
+0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_store_b8 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_store_b8 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_store_b8 v[4:5], v1 ; encoding: [0x7c,0x00,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x00,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_store_b96 v[0:1], v[2:4] offset:-64 ; encoding: [0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_store_b96 v[0:1], v[2:4] offset:64 ; encoding: [0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_store_b96 v[2:3], v[4:6] ; encoding: [0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00]
+0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: flat_store_d16_hi_b16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_store_d16_hi_b16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_store_d16_hi_b16 v[4:5], v1 ; encoding: [0x7c,0x40,0x09,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x40,0x09,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: flat_store_d16_hi_b8 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: flat_store_d16_hi_b8 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: flat_store_d16_hi_b8 v[4:5], v1 ; encoding: [0x7c,0x00,0x09,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00]
+0x7c,0x00,0x09,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00
+
+# GFX1250: global_atomic_add_f32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_f32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_f32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_f32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_f32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_f32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_f32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_add_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_add_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_and_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_and_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_and_b32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_and_b32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_and_b32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_and_b32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_and_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_and_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_and_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_and_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_and_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_and_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_and_b64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_and_b64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_and_b64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_and_b64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0d,0xee,0x00,0x00,0x10,0x01,0x01,0xff,0x07,0x00]
+0x02,0x00,0x0d,0xee,0x00,0x00,0x10,0x01,0x01,0xff,0x07,0x00
+
+# GFX1250: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0d,0xee,0x00,0x00,0x10,0x01,0x01,0x00,0x00,0x00]
+0x02,0x00,0x0d,0xee,0x00,0x00,0x10,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xee,0x00,0x00,0x10,0x02,0x02,0xff,0x07,0x00]
+0x7c,0x00,0x0d,0xee,0x00,0x00,0x10,0x02,0x02,0xff,0x07,0x00
+
+# GFX1250: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xee,0x00,0x00,0x10,0x02,0x02,0x00,0x00,0x00]
+0x7c,0x00,0x0d,0xee,0x00,0x00,0x10,0x02,0x02,0x00,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b32 v1, v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cmpswap_b32 v1, v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b32 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cmpswap_b32 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b64 v0, v[2:5], s[0:1] offset:-64 ; encoding: [0x00,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cmpswap_b64 v0, v[2:5], s[0:1] offset:64 ; encoding: [0x00,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b64 v[0:1], v[2:5], off offset:-64 ; encoding: [0x7c,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cmpswap_b64 v[0:1], v[2:5], off offset:64 ; encoding: [0x7c,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b64 v[2:3], v0, v[2:5], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cmpswap_b64 v[2:3], v0, v[2:5], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b64 v[2:3], v3, v[6:9], s[2:3] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0xff,0x07,0x00]
+0x02,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0xff,0x07,0x00
+
+# GFX1250: global_atomic_cmpswap_b64 v[2:3], v3, v[6:9], s[2:3] th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0x00,0x00,0x00]
+0x02,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0x00,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cmpswap_b64 v[2:3], v[4:5], v[6:9], off offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00]
+0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00
+
+# GFX1250: global_atomic_cmpswap_b64 v[2:3], v[4:5], v[6:9], off th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00]
+0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00
+
+# GFX1250: global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_cond_sub_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_cond_sub_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_dec_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_dec_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_dec_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_dec_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_dec_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_dec_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_dec_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_dec_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_dec_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_dec_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_dec_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_dec_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_dec_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_dec_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_dec_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_dec_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_inc_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_inc_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_inc_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_inc_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_inc_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_inc_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_inc_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_inc_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_inc_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_inc_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_inc_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_inc_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_inc_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_inc_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_inc_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_inc_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_i32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_i32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_i32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_i32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_i32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_i32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_i32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_i32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_i64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_i64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_i64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_i64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_i64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_i64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_i64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_i64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_num_f32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_num_f32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_num_f32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_num_f32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_num_f32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_num_f32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_num_f32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_max_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_max_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_i32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_i32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_i32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_i32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_i32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_i32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_i32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_i32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_i64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_i64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_i64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_i64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_i64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_i64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_i64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_i64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_num_f32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_num_f32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_num_f32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_num_f32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_num_f32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_num_f32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_num_f32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_min_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_min_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_or_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_or_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_or_b32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_or_b32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_or_b32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_or_b32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_or_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_or_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_or_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_or_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_or_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_or_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_or_b64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_or_b64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_or_b64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_or_b64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_bf16 v0, v2, s[0:1] ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00]
+0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_pk_add_bf16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_f16 v0, v2, s[0:1] ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_f16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_pk_add_f16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00]
+0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_pk_add_f16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_pk_add_f16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_clamp_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_clamp_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_clamp_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_clamp_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: global_atomic_sub_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: global_atomic_sub_u64 v[2:3], v[0:1], v[2:3], off
offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v1, v3, s[2:3] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x01,0xff,0x07,0x00] +0x02,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x01,0xff,0x07,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v1, v3, s[2:3] th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x01,0x00,0x00,0x00] +0x02,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v[2:3], v3, off offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x02,0xff,0x07,0x00] +0x7c,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x02,0xff,0x07,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v[2:3], v3, off th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x02,0x00,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
global_atomic_swap_b64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v3, v[6:7], s[2:3] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0xff,0x07,0x00] +0x02,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0xff,0x07,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v3, v[6:7], s[2:3] th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0x00,0x00,0x00] +0x02,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0x00,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v[4:5], v[6:7], off offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00] +0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v[4:5], v[6:7], off th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00] +0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00 + +# GFX1250: global_atomic_xor_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b64 v0, v[2:3], s[0:1] offset:-64 ; 
encoding: [0x00,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_inv ; encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x0a,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_inv scope:SCOPE_DEV ; encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x0a,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_inv scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x0a,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v1, off ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v1, off offset:-64 ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_addtid_b32 v1, off offset:64 ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v1, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_addtid_b32 v1, s[0:1] offset:64 ; encoding: [0x00,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v1, s[2:3] ; encoding: [0x02,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x02,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_load_b128 v[2:5], v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b128 v[2:5], v0, s[0:1] offset:64 ; 
encoding: [0x00,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b128 v[2:5], v5, s[2:3] ; encoding: [0x02,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +0x02,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x05,0x00,0x00,0x00 + +# GFX1250: global_load_b128 v[2:5], v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b128 v[2:5], v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b128 v[2:5], v[6:7], off ; encoding: [0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00] +0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: global_load_b32 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b32 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b32 v1, v3, s[2:3] ; encoding: [0x02,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_b32 v1, v3, s[2:3] offset:2047 ; encoding: [0x02,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x03,0xff,0x07,0x00] +0x02,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x03,0xff,0x07,0x00 + +# GFX1250: global_load_b32 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b32 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b32 v1, v[4:5], off ; encoding: [0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_b32 v1, v[4:5], off offset:2047 ; encoding: [0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x04,0xff,0x07,0x00] +0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x04,0xff,0x07,0x00 + +# GFX1250: global_load_b64 v[2:3], v0, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b64 v[2:3], v0, s[0:1] offset:64 ; encoding: [0x00,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b64 v[2:3], v3, s[2:3] ; encoding: [0x02,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_b64 v[2:3], v[0:1], off offset:-64 ; encoding: [0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b64 v[2:3], v[0:1], off offset:64 ; encoding: [0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b64 v[2:3], v[4:5], off ; encoding: [0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00] 
+0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_b96 v[2:4], v0, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b96 v[2:4], v0, s[0:1] offset:64 ; encoding: [0x00,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b96 v[2:4], v5, s[2:3] ; encoding: [0x02,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +0x02,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x05,0x00,0x00,0x00 + +# GFX1250: global_load_b96 v[2:4], v[0:1], off offset:-64 ; encoding: [0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b96 v[2:4], v[0:1], off offset:64 ; encoding: [0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b96 v[2:4], v[6:7], off ; encoding: [0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00] +0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_block v[8:39], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v5, s[2:3] ; encoding: [0x02,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +0x02,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x05,0x00,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_block v[8:39], v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v[6:7], off ; encoding: [0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x06,0x00,0x00,0x00] +0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v[6:7], off th:TH_LOAD_HT scope:SCOPE_SE ; encoding: [0x7c,0xc0,0x14,0xee,0x08,0x00,0x24,0x00,0x06,0x00,0x00,0x00] +0x7c,0xc0,0x14,0xee,0x08,0x00,0x24,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: global_load_d16_b16 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_b16 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_b16 v1, v3, s[2:3] ; encoding: [0x02,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_b16 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_b16 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] 
+0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_b16 v1, v[4:5], off ; encoding: [0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_b16 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_b16 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_b16 v1, v3, s[2:3] ; encoding: [0x02,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_b16 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_b16 v1, v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_b16 v1, v[4:5], off ; encoding: [0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_i8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_i8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_i8 v1, v3, s[2:3] ; encoding: [0x02,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_i8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_i8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_i8 v1, v[4:5], off ; encoding: [0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_u8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_u8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_u8 v1, v3, s[2:3] ; encoding: [0x02,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_u8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_u8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# 
GFX1250: global_load_d16_hi_u8 v1, v[4:5], off ; encoding: [0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_i8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_i8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_i8 v1, v3, s[2:3] ; encoding: [0x02,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_i8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_i8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_i8 v1, v[4:5], off ; encoding: [0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_u8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_u8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_u8 v1, v3, s[2:3] ; encoding: [0x02,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_u8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_u8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_u8 v1, v[4:5], off ; encoding: [0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_i16 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_i16 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_i16 v1, v3, s[2:3] ; encoding: [0x02,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_i16 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_i16 v1, v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_i16 v1, v[4:5], off ; encoding: [0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] 
+0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_i8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_i8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_i8 v1, v3, s[2:3] ; encoding: [0x02,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_i8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_i8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_i8 v1, v[4:5], off ; encoding: [0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_u16 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_u16 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_u16 v1, v3, s[2:3] ; encoding: [0x02,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_u16 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_u16 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_u16 v1, v[4:5], off ; encoding: [0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_u8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_u8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_u8 v1, v3, s[2:3] ; encoding: [0x02,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_u8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_u8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_u8 v1, v[4:5], off ; encoding: [0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_store_addtid_b32 v2, off offset:-64 ; encoding: [0x7c,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_addtid_b32 v2, off offset:64 ; encoding: [0x7c,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_addtid_b32 v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_addtid_b32 v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b128 v0, v[2:5], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b128 v0, v[2:5], s[0:1] offset:64 ; encoding: [0x00,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b128 v1, v[4:7], s[2:3] ; encoding: [0x02,0x40,0x07,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00] +0x02,0x40,0x07,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_b128 v[0:1], v[2:5], off offset:-64 ; encoding: [0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b128 v[0:1], v[2:5], off offset:64 ; encoding: [0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b128 v[2:3], v[4:7], off ; encoding: [0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_b16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b16 v3, v1, s[2:3] ; encoding: [0x02,0x40,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_store_b16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b16 v[4:5], v1, off ; encoding: [0x7c,0x40,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_store_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b32 v3, v1, s[2:3] offset:-16 ; encoding: [0x02,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0xf0,0xff,0xff] +0x02,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0xf0,0xff,0xff + +# GFX1250: global_store_b32 v3, v1, s[2:3] 
offset:16 ; encoding: [0x02,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x10,0x00,0x00] +0x02,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x10,0x00,0x00 + +# GFX1250: global_store_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b32 v[4:5], v1, off offset:-16 ; encoding: [0x7c,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0xf0,0xff,0xff] +0x7c,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0xf0,0xff,0xff + +# GFX1250: global_store_b32 v[4:5], v1, off offset:16 ; encoding: [0x7c,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x10,0x00,0x00] +0x7c,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x10,0x00,0x00 + +# GFX1250: global_store_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b64 v1, v[2:3], s[2:3] ; encoding: [0x02,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x00] +0x02,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b64 v[2:3], v[4:5], off ; encoding: [0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_b8 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b8 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b8 v3, v1, s[2:3] ; encoding: [0x02,0x00,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_store_b8 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b8 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b8 v[4:5], v1, off ; encoding: [0x7c,0x00,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_store_b96 v0, v[2:4], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b96 v0, v[2:4], s[0:1] offset:64 ; encoding: [0x00,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] 
+0x00,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b96 v1, v[4:6], s[2:3] ; encoding: [0x02,0x00,0x07,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00] +0x02,0x00,0x07,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_b96 v[0:1], v[2:4], off offset:-64 ; encoding: [0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b96 v[0:1], v[2:4], off offset:64 ; encoding: [0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b96 v[2:3], v[4:6], off ; encoding: [0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_block v0, v[2:33], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_block v0, v[2:33], s[0:1] offset:64 ; encoding: [0x00,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_block v1, v[4:35], s[2:3] ; encoding: [0x02,0x00,0x15,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00] +0x02,0x00,0x15,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_block v[0:1], v[2:33], off offset:-64 ; encoding: [0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_block v[0:1], v[2:33], off offset:64 ; encoding: [0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_block v[2:3], v[4:35], off ; encoding: [0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_block v[2:3], v[4:35], off th:TH_STORE_HT scope:SCOPE_SE ; encoding: [0x7c,0x00,0x15,0xee,0x00,0x00,0x24,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x15,0xee,0x00,0x00,0x24,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_d16_hi_b16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_d16_hi_b16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_d16_hi_b16 v3, v1, s[2:3] ; encoding: [0x02,0x40,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_store_d16_hi_b16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_d16_hi_b16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_d16_hi_b16 v[4:5], v1, off ; encoding: [0x7c,0x40,0x09,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x09,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_store_d16_hi_b8 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x00,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_d16_hi_b8 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_d16_hi_b8 v3, v1, s[2:3] ; encoding: [0x02,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_store_d16_hi_b8 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_d16_hi_b8 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_d16_hi_b8 v[4:5], v1, off ; encoding: [0x7c,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_wb ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x0b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wb scope:SCOPE_DEV ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x0b,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wb scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x0b,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wbinv ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x13,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wbinv scope:SCOPE_DEV ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x13,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wbinv scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x13,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], off, off offset:-64 ; encoding: [0x7c,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b128 v[2:5], off, off offset:64 ; encoding: [0x7c,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], off, s0 offset:-64 ; encoding: [0x00,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b128 v[2:5], off, s0 offset:64 ; encoding: [0x00,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], v0, off offset:-64 ; encoding: [0x7c,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b128 v[2:5], v0, off offset:64 ; encoding: [0x7c,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b128 v[2:5], v0, s0 offset:64 ; encoding: [0x00,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00] 
+0x00,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], v2, s1 ; encoding: [0x01,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, off, off ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, off, off offset:-64 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b32 v1, off, off offset:2047 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xff,0x07,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xff,0x07,0x00 + +# GFX1250: scratch_load_b32 v1, off, off offset:64 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, off, s0 offset:-64 ; encoding: [0x00,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b32 v1, off, s0 offset:64 ; encoding: [0x00,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, off, s1 offset:2047 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xff,0x07,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xff,0x07,0x00 + +# GFX1250: scratch_load_b32 v1, v0, off offset:-64 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b32 v1, v0, off offset:64 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, v0, s0 offset:-64 ; encoding: [0x00,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b32 v1, v0, s0 offset:64 ; encoding: [0x00,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, v2, off offset:2047 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x07,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x07,0x00 + +# GFX1250: scratch_load_b32 v1, v2, s1 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:-4095 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x01,0xf0,0xff] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x01,0xf0,0xff + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:-61440 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x10,0xff] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x10,0xff + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:2047 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x07,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x07,0x00 + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:4095 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x0f,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x0f,0x00 + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:61440 ; encoding: 
[0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0xf0,0x00]
+0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0xf0,0x00
+
+# GFX1250: scratch_load_b64 v[2:3], off, off offset:-64 ; encoding: [0x7c,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_b64 v[2:3], off, off offset:64 ; encoding: [0x7c,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_b64 v[2:3], off, s0 offset:-64 ; encoding: [0x00,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_b64 v[2:3], off, s0 offset:64 ; encoding: [0x00,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_b64 v[2:3], v0, off offset:-64 ; encoding: [0x7c,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_b64 v[2:3], v0, off offset:64 ; encoding: [0x7c,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_b64 v[2:3], v0, s0 offset:-64 ; encoding: [0x00,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_b64 v[2:3], v0, s0 offset:64 ; encoding: [0x00,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_b64 v[2:3], v2, s1 ; encoding: [0x01,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_b96 v[2:4], off, off offset:-64 ; encoding: [0x7c,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_b96 v[2:4], off, off offset:64 ; encoding: [0x7c,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_b96 v[2:4], off, s0 offset:-64 ; encoding: [0x00,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_b96 v[2:4], off, s0 offset:64 ; encoding: [0x00,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_b96 v[2:4], v0, off offset:-64 ; encoding: [0x7c,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_b96 v[2:4], v0, off offset:64 ; encoding: [0x7c,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_b96 v[2:4], v0, s0 offset:-64 ; encoding: [0x00,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_b96 v[2:4], v0, s0 offset:64 ; encoding: [0x00,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_b96 v[2:4], v2, s1 ; encoding: [0x01,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_block v[4:35], off, off offset:-64 ; encoding: [0x7c,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_block v[4:35], off, off offset:64 ; encoding: [0x7c,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_block v[4:35], off, s0 offset:-64 ; encoding: [0x00,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_block v[4:35], off, s0 offset:64 ; encoding: [0x00,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_block v[4:35], v0, off offset:-64 ; encoding: [0x7c,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_block v[4:35], v0, off offset:64 ; encoding: [0x7c,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_block v[4:35], v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_block v[4:35], v0, s0 offset:64 ; encoding: [0x00,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_block v[4:35], v2, s1 ; encoding: [0x01,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_block v[4:35], v2, s1 th:TH_LOAD_HT scope:SCOPE_SE ; encoding: [0x01,0xc0,0x14,0xed,0x04,0x00,0x26,0x00,0x02,0x00,0x00,0x00]
+0x01,0xc0,0x14,0xed,0x04,0x00,0x26,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_d16_b16 v1, off, off offset:-64 ; encoding: [0x7c,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_b16 v1, off, off offset:64 ; encoding: [0x7c,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_b16 v1, off, s0 offset:-64 ; encoding: [0x00,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_b16 v1, off, s0 offset:64 ; encoding: [0x00,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_b16 v1, v0, off offset:-64 ; encoding: [0x7c,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_b16 v1, v0, off offset:64 ; encoding: [0x7c,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_b16 v1, v0, s0 offset:-64 ; encoding: [0x00,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_b16 v1, v0, s0 offset:64 ; encoding: [0x00,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_b16 v1, v2, s1 ; encoding: [0x01,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_b16 v1, off, off offset:-64 ; encoding: [0x7c,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_b16 v1, off, off offset:64 ; encoding: [0x7c,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_b16 v1, off, s0 offset:-64 ; encoding: [0x00,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_b16 v1, off, s0 offset:64 ; encoding: [0x00,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_b16 v1, v0, off offset:-64 ; encoding: [0x7c,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_b16 v1, v0, off offset:64 ; encoding: [0x7c,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_b16 v1, v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_b16 v1, v0, s0 offset:64 ; encoding: [0x00,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_b16 v1, v2, s1 ; encoding: [0x01,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_i8 v1, off, off offset:-64 ; encoding: [0x7c,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_i8 v1, off, off offset:64 ; encoding: [0x7c,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_i8 v1, off, s0 offset:-64 ; encoding: [0x00,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_i8 v1, off, s0 offset:64 ; encoding: [0x00,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_i8 v1, v0, off offset:-64 ; encoding: [0x7c,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_i8 v1, v0, off offset:64 ; encoding: [0x7c,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_i8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_i8 v1, v0, s0 offset:64 ; encoding: [0x00,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_i8 v1, v2, s1 ; encoding: [0x01,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_u8 v1, off, off offset:-64 ; encoding: [0x7c,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_u8 v1, off, off offset:64 ; encoding: [0x7c,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_u8 v1, off, s0 offset:-64 ; encoding: [0x00,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_u8 v1, off, s0 offset:64 ; encoding: [0x00,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_u8 v1, v0, off offset:-64 ; encoding: [0x7c,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_u8 v1, v0, off offset:64 ; encoding: [0x7c,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_u8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_hi_u8 v1, v0, s0 offset:64 ; encoding: [0x00,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_hi_u8 v1, v2, s1 ; encoding: [0x01,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_d16_i8 v1, off, off offset:-64 ; encoding: [0x7c,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_i8 v1, off, off offset:64 ; encoding: [0x7c,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_i8 v1, off, s0 offset:-64 ; encoding: [0x00,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_i8 v1, off, s0 offset:64 ; encoding: [0x00,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_i8 v1, v0, off offset:-64 ; encoding: [0x7c,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_i8 v1, v0, off offset:64 ; encoding: [0x7c,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_i8 v1, v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_i8 v1, v0, s0 offset:64 ; encoding: [0x00,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_i8 v1, v2, s1 ; encoding: [0x01,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_d16_u8 v1, off, off offset:-64 ; encoding: [0x7c,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_u8 v1, off, off offset:64 ; encoding: [0x7c,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_u8 v1, off, s0 offset:-64 ; encoding: [0x00,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_u8 v1, off, s0 offset:64 ; encoding: [0x00,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_u8 v1, v0, off offset:-64 ; encoding: [0x7c,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_u8 v1, v0, off offset:64 ; encoding: [0x7c,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_u8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_d16_u8 v1, v0, s0 offset:64 ; encoding: [0x00,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_d16_u8 v1, v2, s1 ; encoding: [0x01,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_i16 v1, off, off offset:-64 ; encoding: [0x7c,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_i16 v1, off, off offset:64 ; encoding: [0x7c,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_i16 v1, off, s0 offset:-64 ; encoding: [0x00,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_i16 v1, off, s0 offset:64 ; encoding: [0x00,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_i16 v1, v0, off offset:-64 ; encoding: [0x7c,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_i16 v1, v0, off offset:64 ; encoding: [0x7c,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_i16 v1, v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_i16 v1, v0, s0 offset:64 ; encoding: [0x00,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_i16 v1, v2, s1 ; encoding: [0x01,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_i8 v1, off, off offset:-64 ; encoding: [0x7c,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_i8 v1, off, off offset:64 ; encoding: [0x7c,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_i8 v1, off, s0 offset:-64 ; encoding: [0x00,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_i8 v1, off, s0 offset:64 ; encoding: [0x00,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_i8 v1, v0, off offset:-64 ; encoding: [0x7c,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_i8 v1, v0, off offset:64 ; encoding: [0x7c,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_i8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_i8 v1, v0, s0 offset:64 ; encoding: [0x00,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_i8 v1, v2, s1 ; encoding: [0x01,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_u16 v1, off, off offset:-64 ; encoding: [0x7c,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_u16 v1, off, off offset:64 ; encoding: [0x7c,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_u16 v1, off, s0 offset:-64 ; encoding: [0x00,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_u16 v1, off, s0 offset:64 ; encoding: [0x00,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_u16 v1, v0, off offset:-64 ; encoding: [0x7c,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_u16 v1, v0, off offset:64 ; encoding: [0x7c,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_u16 v1, v0, s0 offset:-64 ; encoding: [0x00,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_u16 v1, v0, s0 offset:64 ; encoding: [0x00,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_u16 v1, v2, s1 ; encoding: [0x01,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_load_u8 v1, off, off offset:-64 ; encoding: [0x7c,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_u8 v1, off, off offset:64 ; encoding: [0x7c,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_u8 v1, off, s0 offset:-64 ; encoding: [0x00,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_u8 v1, off, s0 offset:64 ; encoding: [0x00,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_u8 v1, v0, off offset:-64 ; encoding: [0x7c,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_u8 v1, v0, off offset:64 ; encoding: [0x7c,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_u8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_load_u8 v1, v0, s0 offset:64 ; encoding: [0x00,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00]
+0x00,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_load_u8 v1, v2, s1 ; encoding: [0x01,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00]
+0x01,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b128 off, v[2:5], off offset:-64 ; encoding: [0x7c,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b128 off, v[2:5], off offset:64 ; encoding: [0x7c,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b128 off, v[2:5], s0 offset:-64 ; encoding: [0x00,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b128 off, v[2:5], s0 offset:64 ; encoding: [0x00,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b128 v0, v[2:5], off offset:-64 ; encoding: [0x7c,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b128 v0, v[2:5], off offset:64 ; encoding: [0x7c,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b128 v0, v[2:5], s0 offset:-64 ; encoding: [0x00,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b128 v0, v[2:5], s0 offset:64 ; encoding: [0x00,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b128 v1, v[2:5], s3 ; encoding: [0x03,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b16 off, v2, off offset:-64 ; encoding: [0x7c,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b16 off, v2, off offset:64 ; encoding: [0x7c,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b16 off, v2, s0 offset:-64 ; encoding: [0x00,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b16 off, v2, s0 offset:64 ; encoding: [0x00,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b16 v0, v2, off offset:-64 ; encoding: [0x7c,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b16 v0, v2, off offset:64 ; encoding: [0x7c,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b16 v0, v2, s0 offset:-64 ; encoding: [0x00,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b16 v0, v2, s0 offset:64 ; encoding: [0x00,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b16 v1, v2, s3 ; encoding: [0x03,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b32 off, v2, off ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b32 off, v2, off offset:-64 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b32 off, v2, off offset:2047 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xff,0x07,0x00]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xff,0x07,0x00
+
+# GFX1250: scratch_store_b32 off, v2, off offset:64 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b32 off, v2, s0 offset:-64 ; encoding: [0x00,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b32 off, v2, s0 offset:64 ; encoding: [0x00,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b32 off, v2, s3 offset:2047 ; encoding: [0x03,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xff,0x07,0x00]
+0x03,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xff,0x07,0x00
+
+# GFX1250: scratch_store_b32 v0, v2, off offset:-64 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b32 v0, v2, off offset:64 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b32 v0, v2, s0 offset:-64 ; encoding: [0x00,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b32 v0, v2, s0 offset:64 ; encoding: [0x00,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b32 v1, v2, off offset:2047 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x07,0x00]
+0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x07,0x00
+
+# GFX1250: scratch_store_b32 v1, v2, s1 offset:-4095 ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x01,0xf0,0xff]
+0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x01,0xf0,0xff
+
+# GFX1250: scratch_store_b32 v1, v2, s1 offset:-61440 ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x10,0xff]
+0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x10,0xff
+
+# GFX1250: scratch_store_b32 v1, v2, s1 offset:4095 ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x0f,0x00]
+0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x0f,0x00
+
+# GFX1250: scratch_store_b32 v1, v2, s1 offset:61440 ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0xf0,0x00]
+0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0xf0,0x00
+
+# GFX1250: scratch_store_b32 v1, v2, s3 ; encoding: [0x03,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b32 v1, v2, s3 offset:2047 ; encoding: [0x03,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x07,0x00]
+0x03,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x07,0x00
+
+# GFX1250: scratch_store_b64 off, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b64 off, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b64 off, v[2:3], s0 offset:-64 ; encoding: [0x00,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b64 off, v[2:3], s0 offset:64 ; encoding: [0x00,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b64 v0, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b64 v0, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b64 v0, v[2:3], s0 offset:-64 ; encoding: [0x00,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b64 v0, v[2:3], s0 offset:64 ; encoding: [0x00,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b64 v1, v[2:3], s3 ; encoding: [0x03,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b8 off, v2, off offset:-64 ; encoding: [0x7c,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b8 off, v2, off offset:64 ; encoding: [0x7c,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b8 off, v2, s0 offset:-64 ; encoding: [0x00,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b8 off, v2, s0 offset:64 ; encoding: [0x00,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b8 v0, v2, off offset:-64 ; encoding: [0x7c,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b8 v0, v2, off offset:64 ; encoding: [0x7c,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b8 v0, v2, s0 offset:-64 ; encoding: [0x00,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b8 v0, v2, s0 offset:64 ; encoding: [0x00,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b8 v1, v2, s3 ; encoding: [0x03,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_b96 off, v[2:4], off offset:-64 ; encoding: [0x7c,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b96 off, v[2:4], off offset:64 ; encoding: [0x7c,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b96 off, v[2:4], s0 offset:-64 ; encoding: [0x00,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b96 off, v[2:4], s0 offset:64 ; encoding: [0x00,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b96 v0, v[2:4], off offset:-64 ; encoding: [0x7c,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b96 v0, v[2:4], off offset:64 ; encoding: [0x7c,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b96 v0, v[2:4], s0 offset:-64 ; encoding: [0x00,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_b96 v0, v[2:4], s0 offset:64 ; encoding: [0x00,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_b96 v1, v[2:4], s3 ; encoding: [0x03,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_block off, v[2:33], off offset:-64 ; encoding: [0x7c,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_block off, v[2:33], off offset:64 ; encoding: [0x7c,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_block off, v[2:33], s0 offset:-64 ; encoding: [0x00,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_block off, v[2:33], s0 offset:64 ; encoding: [0x00,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_block v0, v[2:33], off offset:-64 ; encoding: [0x7c,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_block v0, v[2:33], off offset:64 ; encoding: [0x7c,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_block v0, v[2:33], s0 offset:-64 ; encoding: [0x00,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_block v0, v[2:33], s0 offset:64 ; encoding: [0x00,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_block v1, v[2:33], s3 ; encoding: [0x03,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_block v1, v[2:33], s3 th:TH_STORE_HT scope:SCOPE_SE ; encoding: [0x03,0x00,0x15,0xed,0x00,0x00,0x26,0x01,0x01,0x00,0x00,0x00]
+0x03,0x00,0x15,0xed,0x00,0x00,0x26,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b16 off, v2, off offset:-64 ; encoding: [0x7c,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_d16_hi_b16 off, v2, off offset:64 ; encoding: [0x7c,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b16 off, v2, s0 offset:-64 ; encoding: [0x00,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_d16_hi_b16 off, v2, s0 offset:64 ; encoding: [0x00,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b16 v0, v2, off offset:-64 ; encoding: [0x7c,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_d16_hi_b16 v0, v2, off offset:64 ; encoding: [0x7c,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b16 v0, v2, s0 offset:-64 ; encoding: [0x00,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_d16_hi_b16 v0, v2, s0 offset:64 ; encoding: [0x00,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b16 v1, v2, s3 ; encoding: [0x03,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b8 off, v2, off offset:-64 ; encoding: [0x7c,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_d16_hi_b8 off, v2, off offset:64 ; encoding: [0x7c,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b8 off, v2, s0 offset:-64 ; encoding: [0x00,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_d16_hi_b8 off, v2, s0 offset:64 ; encoding: [0x00,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b8 v0, v2, off offset:-64 ; encoding: [0x7c,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_d16_hi_b8 v0, v2, off offset:64 ; encoding: [0x7c,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b8 v0, v2, s0 offset:-64 ; encoding: [0x00,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff
+
+# GFX1250: scratch_store_d16_hi_b8 v0, v2, s0 offset:64 ; encoding: [0x00,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00]
+0x00,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00
+
+# GFX1250: scratch_store_d16_hi_b8 v1, v2, s3 ; encoding: [0x03,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00]
+0x03,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00
+
 # GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
 0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
index 1b1f65b4398cd..5f37ba91e071b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
@@ -2,6 +2,9 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
 
+0xff,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf
+# GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+
 0xc1,0x3a,0x08,0x7e
 # GFX1250: v_mov_b64_e32 v[4:5], -1 ; encoding: [0xc1,0x3a,0x08,0x7e]
 
@@ -26,6 +29,114 @@
 0x6a,0x3a,0x08,0x7e
 # GFX1250: v_mov_b64_e32 v[4:5], vcc ; encoding: [0x6a,0x3a,0x08,0x7e]
 
+0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf
+# GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+0xc1,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e]
+
+0xf0,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e]
+
+0x7f,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e]
+
+0x7e,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e]
+
+0x7d,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e]
+
+0x7c,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e]
+
+0x01,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, s1 ; encoding: [0x01,0x3c,0x0a,0x7e]
+
+0x69,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e]
+
+0xfd,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e]
+
+0x7b,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e]
+
+0x01,0x3d,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e]
+
+0xff,0x3d,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e]
+
+0x6b,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e]
+
+0x6a,0x3c,0x0a,0x7e
+# GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e]
+
+0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00
+# GFX1250-REAL16: v_tanh_f16_e32 v127.l, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00]
+# GFX1250-FAKE16: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+0xc1,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e]
+
+0xf0,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e]
+
+0x7f,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e]
+
+0x7e,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e]
+
+0x7d,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e]
+
+0x7c,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, null ; encoding: [0x7c,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e]
+
+0x01,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, s1 ; encoding: [0x01,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e]
+
+0x69,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, s105 ; encoding: [0x69,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e]
+
+0xfd,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e]
+
+0x7b,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e]
+
+0x01,0x3f,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, v1.l ; encoding: [0x01,0x3f,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e]
+
+0x7f,0x3f,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, v127.l ; encoding: [0x7f,0x3f,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e]
+
+0x6b,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e]
+
+0x6a,0x3e,0x0a,0x7e
+# GFX1250-REAL16: v_tanh_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e]
+# GFX1250-FAKE16: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e]
+
+0x81,0x3f,0x0a,0x7f
+# GFX1250-REAL16: v_tanh_f16_e32 v5.h, v1.h ; encoding: [0x81,0x3f,0x0a,0x7f]
+
 0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00
 # GFX1250-REAL16: v_tanh_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00]
 # GFX1250-FAKE16: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00]
@@ -89,6 +200,447 @@
 0x81,0x95,0x0a,0x7f
 # GFX1250-REAL16: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f]
 
+0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00
+# GFX1250-REAL16: v_rcp_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+0xc1,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, -1 ; encoding: [0xc1,0xf2,0x0a,0x7e]
+
+0xf0,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf2,0x0a,0x7e]
+
+0x7f,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf2,0x0a,0x7e]
+
+0x7e,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf2,0x0a,0x7e]
+
+0x7d,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, m0 ; encoding: [0x7d,0xf2,0x0a,0x7e]
+
+0x7c,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, null ; encoding: [0x7c,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, null ; encoding: [0x7c,0xf2,0x0a,0x7e]
+
+0x01,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, s1 ; encoding: [0x01,0xf2,0x0a,0x7e]
+
+0x69,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, s105 ; encoding: [0x69,0xf2,0x0a,0x7e]
+
+0xfd,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf2,0x0a,0x7e]
+
+0x7b,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf2,0x0a,0x7e]
+
+0x01,0xf3,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf3,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, v1 ; encoding: [0x01,0xf3,0x0a,0x7e]
+
+0x7f,0xf3,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf3,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, v127 ; encoding: [0x7f,0xf3,0x0a,0x7e]
+
+0x6b,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf2,0x0a,0x7e]
+
+0x6a,0xf2,0x0a,0x7e
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf2,0x0a,0x7e]
+# GFX1250-FAKE16: v_rcp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf2,0x0a,0x7e]
+
+0x81,0xf3,0x0a,0x7f
+# GFX1250-REAL16: v_rcp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf3,0x0a,0x7f]
+
+0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00
+# GFX1250-REAL16: v_sqrt_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+0xc1,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, -1 ; encoding: [0xc1,0xf4,0x0a,0x7e]
+
+0xf0,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf4,0x0a,0x7e]
+
+0x7f,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf4,0x0a,0x7e]
+
+0x7e,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf4,0x0a,0x7e]
+
+0x7d,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, m0 ; encoding: [0x7d,0xf4,0x0a,0x7e]
+
+0x7c,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, null ; encoding: [0x7c,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, null ; encoding: [0x7c,0xf4,0x0a,0x7e]
+
+0x01,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, s1 ; encoding: [0x01,0xf4,0x0a,0x7e]
+
+0x69,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, s105 ; encoding: [0x69,0xf4,0x0a,0x7e]
+
+0xfd,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf4,0x0a,0x7e]
+
+0x7b,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf4,0x0a,0x7e]
+
+0x01,0xf5,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf5,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, v1 ; encoding: [0x01,0xf5,0x0a,0x7e]
+
+0x7f,0xf5,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf5,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, v127 ; encoding: [0x7f,0xf5,0x0a,0x7e]
+
+0x6b,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf4,0x0a,0x7e]
+
+0x6a,0xf4,0x0a,0x7e
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf4,0x0a,0x7e]
+# GFX1250-FAKE16: v_sqrt_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf4,0x0a,0x7e]
+
+0x81,0xf5,0x0a,0x7f
+# GFX1250-REAL16: v_sqrt_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf5,0x0a,0x7f]
+
+0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00
+# GFX1250-REAL16: v_rsq_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+0xc1,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e]
+
+0xf0,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e]
+
+0x7f,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e]
+
+0x7e,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e]
+
+0x7d,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e]
+
+0x7c,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, null ; encoding: [0x7c,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e]
+
+0x01,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e]
+
+0x69,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e]
+
+0xfd,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e]
+
+0x7b,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e]
+
+0x01,0xf7,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf7,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e]
+
+0x7f,0xf7,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf7,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e]
+
+0x6b,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e]
+
+0x6a,0xf6,0x0a,0x7e
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e]
+# GFX1250-FAKE16: v_rsq_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e]
+
+0x81,0xf7,0x0a,0x7f
+# GFX1250-REAL16: v_rsq_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf7,0x0a,0x7f]
+
+0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00
+# GFX1250-REAL16: v_log_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00]
+# GFX1250-FAKE16: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+0xc1,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e]
+
+0xf0,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e]
+
+0x7f,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e]
+
+0x7e,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e]
+
+0x7d,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e]
+
+0x7c,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, null ; encoding: [0x7c,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, null ; encoding: [0x7c,0xf8,0x0a,0x7e]
+
+0x01,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, s1 ; encoding: [0x01,0xf8,0x0a,0x7e]
+
+0x69,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, s105 ; encoding: [0x69,0xf8,0x0a,0x7e]
+
+0xfd,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e]
+
+0x7b,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e]
+
+0x01,0xf9,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf9,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, v1 ; encoding: [0x01,0xf9,0x0a,0x7e]
+
+0x7f,0xf9,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf9,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, v127 ; encoding: [0x7f,0xf9,0x0a,0x7e]
+
+0x6b,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e]
+
+0x6a,0xf8,0x0a,0x7e
+# GFX1250-REAL16: v_log_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e]
+# GFX1250-FAKE16: v_log_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e]
+
+0x81,0xf9,0x0a,0x7f
+# GFX1250-REAL16: v_log_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf9,0x0a,0x7f]
+
+0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00
+# GFX1250-REAL16: v_exp_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00]
+# GFX1250-FAKE16: v_exp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+0xc1,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e]
+
+0xf0,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e]
+
+0x7f,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e]
+
+0x7e,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e]
+
+0x7d,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e]
+
+0x7c,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, null ; encoding: [0x7c,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, null ; encoding: [0x7c,0xfa,0x0a,0x7e]
+
+0x01,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, s1 ; encoding: [0x01,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, s1 ; encoding: [0x01,0xfa,0x0a,0x7e]
+
+0x69,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, s105 ; encoding: [0x69,0xfa,0x0a,0x7e]
+
+0xfd,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e]
+
+0x7b,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e]
+
+0x01,0xfb,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xfb,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, v1 ; encoding: [0x01,0xfb,0x0a,0x7e]
+
+0x7f,0xfb,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xfb,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, v127 ; encoding: [0x7f,0xfb,0x0a,0x7e]
+
+0x6b,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e]
+
+0x6a,0xfa,0x0a,0x7e
+# GFX1250-REAL16: v_exp_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e]
+# GFX1250-FAKE16: v_exp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e]
+
+0x81,0xfb,0x0a,0x7f
+# GFX1250-REAL16: v_exp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfb,0x0a,0x7f]
+
+0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00
+# GFX1250-REAL16: v_sin_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00]
+# GFX1250-FAKE16: v_sin_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+0xc1,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e]
+
+0xf0,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e]
+
+0x7f,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e]
+
+0x7e,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e]
+
+0x7d,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e]
+
+0x7c,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, null ; encoding: [0x7c,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, null ; encoding: [0x7c,0xfc,0x0a,0x7e]
+
+0x01,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, s1 ; encoding: [0x01,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, s1 ; encoding: [0x01,0xfc,0x0a,0x7e]
+
+0x69,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, s105 ; encoding: [0x69,0xfc,0x0a,0x7e]
+
+0xfd,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e]
+
+0x7b,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e]
+
+0x01,0xfd,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xfd,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, v1 ; encoding: [0x01,0xfd,0x0a,0x7e]
+
+0x7f,0xfd,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xfd,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, v127 ; encoding: [0x7f,0xfd,0x0a,0x7e]
+
+0x6b,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e]
+
+0x6a,0xfc,0x0a,0x7e
+# GFX1250-REAL16: v_sin_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e]
+# GFX1250-FAKE16: v_sin_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e]
+
+0x81,0xfd,0x0a,0x7f
+# GFX1250-REAL16: v_sin_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfd,0x0a,0x7f]
+
+0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00
+# GFX1250-REAL16: v_cos_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00]
+# GFX1250-FAKE16: v_cos_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+0xc1,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e]
+
+0xf0,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e]
+
+0x7f,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e]
+
+0x7e,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e]
+
+0x7d,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e]
+
+0x7c,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, null ; encoding: [0x7c,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, null ; encoding: [0x7c,0xfe,0x0a,0x7e]
+
+0x01,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, s1 ; encoding: [0x01,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, s1 ; encoding: [0x01,0xfe,0x0a,0x7e]
+
+0x69,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, s105 ; encoding: [0x69,0xfe,0x0a,0x7e]
+
+0xfd,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e]
+
+0x7b,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e]
+
+0x01,0xff,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xff,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, v1 ; encoding: [0x01,0xff,0x0a,0x7e]
+
+0x7f,0xff,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xff,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, v127 ; encoding: [0x7f,0xff,0x0a,0x7e]
+
+0x6b,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e]
+
+0x6a,0xfe,0x0a,0x7e
+# GFX1250-REAL16: v_cos_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e]
+# GFX1250-FAKE16: v_cos_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e]
+
+0x81,0xff,0x0a,0x7f
+# GFX1250-REAL16: v_cos_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xff,0x0a,0x7f]
+
 0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00
 # GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
index db36451c61715..57bee2766ce44 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
@@ -2,6 +2,107 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
 
+0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30
+# GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01
+# GFX1250: v_tanh_f32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff
+# GFX1250: v_tanh_f32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13
+# GFX1250: v_tanh_f32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30
+# GFX1250-REAL16: v_tanh_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+# GFX1250-FAKE16: v_tanh_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13
+# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+0xfa,0x3e,0x0a,0x7f,0x81,0x1b,0x00,0xff
+# GFX1250-REAL16: v_tanh_f16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7f,0x81,0x1b,0x00,0xff]
+
 0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30
 # GFX1250-REAL16: v_tanh_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 # GFX1250-FAKE16: v_tanh_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
@@
-61,6 +162,419 @@ 0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff # GFX1250-REAL16: v_tanh_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff] +0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_rcp_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_rcp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xf2,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xf2,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xf2,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xf2,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xf2,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_rcp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7f,0x81,0x1b,0x00,0xff] + +0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_sqrt_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xf4,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xf4,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xf4,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xf4,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7f,0x81,0x1b,0x00,0xff] + +0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_rsq_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: 
v_rsq_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff] + +0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_log_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_log_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff] + +0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_exp_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_exp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff] + +0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_sin_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_sin_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff] + +0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_cos_bf16_dpp v127.l, -|v127.l| row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_cos_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff] + 0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index 4598efc07f489..28ec6b11b4de3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -2,6 +2,31 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00] + +0xe9,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0x3e,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + 0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; 
encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00] @@ -18,6 +43,113 @@ # GFX1250-REAL16: v_tanh_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7f,0x81,0x77,0x39,0x05] # GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] +0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rcp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xf2,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_rcp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xe9,0xf4,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf4,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf4,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sqrt_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xf4,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_sqrt_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + +0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05] + +0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05] + +0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05] + +0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05] + +0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xfe,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7f,0x81,0x77,0x39,0x05] + 0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt index f0fcddb06599f..c1213f2d9ec0d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop2.txt @@ -1,6 +1,117 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +0x02,0x09,0xfc,0x2f +# GFX1250: v_fmac_f64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x2f] + +0x02,0x11,0xfc,0x2f +# GFX1250: v_fmac_f64_e32 v[254:255], v[2:3], v[8:9] ; encoding: [0x02,0x11,0xfc,0x2f] + +0xc1,0x08,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x2e] + +0xc1,0x10,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], -1, v[8:9] ; encoding: [0xc1,0x10,0x08,0x2e] + +0xf7,0x08,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x2e] + +0xf7,0x10,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], -4.0, v[8:9] ; encoding: [0xf7,0x10,0x08,0x2e] + +0x80,0x08,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x2e] + +0x80,0x10,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], 0, v[8:9] ; encoding: [0x80,0x10,0x08,0x2e] + +0xf0,0x08,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x2e] + +0xf0,0x10,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], 0.5, v[8:9] ; encoding: [0xf0,0x10,0x08,0x2e] + +0xff,0x08,0x08,0x2e,0x73,0x72,0x71,0x3f +# GFX1250: v_fmac_f64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x2e,0x73,0x72,0x71,0x3f] + +0xff,0x08,0x08,0x2e,0x56,0x34,0x12,0xaf +# GFX1250: v_fmac_f64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xff,0x08,0x08,0x2e,0x56,0x34,0x12,0xaf] + +0x7e,0x08,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x2e] + +0x7e,0x10,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], exec, v[8:9] ; encoding: [0x7e,0x10,0x08,0x2e] + +0xfe,0x09,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x2e] + +0xfe,0x11,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], v[254:255], v[8:9] ; encoding: [0xfe,0x11,0x08,0x2e] + +0x02,0xfd,0x09,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x2e] + +0x02,0x09,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x2e] + +0x02,0x11,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], v[2:3], v[8:9] ; encoding: 
[0x02,0x11,0x08,0x2e] + +0x6a,0x08,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x2e] + +0x6a,0x10,0x08,0x2e +# GFX1250: v_fmac_f64_e32 v[4:5], vcc, v[8:9] ; encoding: [0x6a,0x10,0x08,0x2e] + +0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x60 +# GFX1250: v_fmac_f64_e64 v[4:5], -v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x60] + +0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x20 +# GFX1250: v_fmac_f64_e64 v[4:5], -v[2:3], v[8:9] ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x20] + +0x04,0x00,0x17,0xd5,0x02,0x83,0x01,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x83,0x01,0x00] + +0x04,0x00,0x17,0xd5,0x02,0xef,0x01,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0xef,0x01,0x00] + +0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x40 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x40] + +0x04,0x00,0x17,0xd5,0x02,0x01,0x01,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x01,0x01,0x00] + +0x04,0x00,0x17,0xd5,0x02,0xe1,0x01,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0xe1,0x01,0x00] + +0x04,0x00,0x17,0xd5,0x02,0xfd,0x00,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x17,0xd5,0x02,0xfd,0x00,0x00] + +0x04,0x80,0x17,0xd5,0x02,0x11,0x02,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x17,0xd5,0x02,0x11,0x02,0x00] + +0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x18 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x18] + +0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x08 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:2 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x08] + +0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x10 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:4 ; encoding: [0x04,0x00,0x17,0xd5,0x02,0x11,0x02,0x10] + +0x04,0x00,0x17,0xd5,0x02,0xd5,0x00,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x17,0xd5,0x02,0xd5,0x00,0x00] + +0x04,0x02,0x17,0xd5,0x02,0x11,0x02,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], v[2:3], |v[8:9]| ; encoding: [0x04,0x02,0x17,0xd5,0x02,0x11,0x02,0x00] + +0x04,0x01,0x17,0xd5,0x02,0x11,0x02,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], |v[2:3]|, v[8:9] ; encoding: [0x04,0x01,0x17,0xd5,0x02,0x11,0x02,0x00] + +0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00 +# GFX1250: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x17,0xd5,0x02,0x11,0x02,0x00] + 0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40 # GFX1250: v_fmaak_f64 v[254:255], 0x405ec000, v[254:255], 0x405ec000 ; encoding: [0xfe,0xfc,0xfd,0x49,0x00,0x00,0x00,0x00,0x00,0xc0,0x5e,0x40] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 3e345d73cd996..5004762729701 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -2,6 +2,115 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_tanh_f32_e64 v255, 
-|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x9e,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x9e,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_tanh_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9e,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x9e,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x9e,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, null ; encoding: [0x05,0x00,0x9e,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x9e,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_tanh_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9e,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x9e,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9e,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x9e,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00] + 
+0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, v128 ; encoding: [0x05,0x00,0x9f,0xd5,0x80,0x01,0x00,0x00] + 0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] @@ -66,6 +175,454 @@ # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xca,0xd5,0x80,0x01,0x00,0x00] +0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, -1 ; encoding: 
[0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xf9,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf9,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xf9,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf9,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xf9,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf9,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xf9,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v255.l, 
-|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfa,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfa,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfa,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfa,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfa,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfa,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xfa,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x6a,0x00,0x00,0x00] +# 
GFX1250-FAKE16: v_sqrt_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfa,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v255 ; encoding: 
[0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] + 
+0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_exp_bf16_e64 
v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, s1 ; encoding: 
[0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00] +# 
GFX1250-FAKE16: v_cos_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xff,0xd5,0x80,0x01,0x00,0x00] + 0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index bae8909bb550e..de908b95d94f9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -1,6 +1,528 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX1250,GFX1250-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s + +0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v255, -|v255| 
clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + +0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: 
v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + +0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: 
v_log_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + +0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+
+0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+
+0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+
+0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+
+0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+
+0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+
+0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+
+0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+
+0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+
+0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+
+0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+
+0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+
+0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+
+0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+
+0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

 0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
index 52f7932205dad..cfe7173c383b3 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
@@ -2,6 +2,178 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s

+0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250: v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250-REAL16: v_tanh_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
+# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
+0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
+# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
+0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
+# GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
+0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
+# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
+0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
+# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
+0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
+# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
+0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
+# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
+0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
+
+0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
+
+0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
+
+0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
+# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
 0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
new file mode 100644
index 0000000000000..d76ec4c7e6185
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
@@ -0,0 +1,734 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
+
+0x18,0x00,0x68,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x68,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x08,0x68,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x68,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x02,0x68,0xcc,0x00,0x11,0x72,0x5c
+# GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x68,0xcc,0x00,0x11,0x72,0x5c]
+
+0x18,0x01,0x68,0xcc,0x00,0x11,0x72,0x3c
+# GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x68,0xcc,0x00,0x11,0x72,0x3c]
+
+0x18,0x00,0x69,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x69,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x08,0x69,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x69,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x20,0x69,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x69,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x40,0x69,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x69,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x02,0x69,0xcc,0x00,0x11,0x82,0x5c
+# GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x69,0xcc,0x00,0x11,0x82,0x5c]
+
+0x18,0x01,0x69,0xcc,0x00,0x11,0x82,0x3c
+# GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x69,0xcc,0x00,0x11,0x82,0x3c]
+
+0x18,0x00,0x7a,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x7a,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x08,0x7a,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x7a,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x20,0x7a,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x7a,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x40,0x7a,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x7a,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x00,0x79,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x79,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x08,0x79,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x79,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x20,0x79,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x79,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x40,0x79,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x79,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x00,0x78,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x78,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x08,0x78,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x78,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x20,0x78,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x78,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x40,0x78,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x78,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x00,0x77,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x77,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x08,0x77,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x77,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x20,0x77,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x77,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x40,0x77,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x77,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x00,0x67,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x67,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x08,0x67,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x67,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x20,0x67,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse ; encoding: [0x18,0x20,0x67,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x40,0x67,0xcc,0x00,0x11,0x72,0x1c
+# GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; encoding: [0x18,0x40,0x67,0xcc,0x00,0x11,0x72,0x1c]
+
+0x18,0x02,0x67,0xcc,0x00,0x11,0x72,0x5c
+# GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x67,0xcc,0x00,0x11,0x72,0x5c]
+
+0x18,0x01,0x67,0xcc,0x00,0x11,0x72,0x3c
+# GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x67,0xcc,0x00,0x11,0x72,0x3c]
+
+0x18,0x00,0x76,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x76,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x08,0x76,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x76,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x20,0x76,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x76,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x40,0x76,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x76,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x00,0x75,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x75,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x08,0x75,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x75,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x20,0x75,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x75,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x40,0x75,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x75,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x00,0x74,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x74,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x08,0x74,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x74,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x20,0x74,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x74,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x40,0x74,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x74,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x00,0x73,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x73,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x08,0x73,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x73,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x20,0x73,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x73,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x40,0x73,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x73,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x00,0x66,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x66,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x08,0x66,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x66,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x02,0x66,0xcc,0x00,0x11,0x82,0x5c
+# GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x66,0xcc,0x00,0x11,0x82,0x5c]
+
+0x18,0x01,0x66,0xcc,0x00,0x11,0x82,0x3c
+# GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x66,0xcc,0x00,0x11,0x82,0x3c]
+
+0x18,0x00,0x65,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x65,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x08,0x65,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x65,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x20,0x65,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x65,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x40,0x65,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x65,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x02,0x65,0xcc,0x00,0x11,0x82,0x5c
+# GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x65,0xcc,0x00,0x11,0x82,0x5c]
+
+0x18,0x01,0x65,0xcc,0x00,0x11,0x82,0x3c
+# GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x65,0xcc,0x00,0x11,0x82,0x3c]
+
+0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x08,0x7b,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x7b,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x20,0x7b,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x7b,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x40,0x7b,0xcc,0x00,0x11,0x82,0x1c
+# GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x7b,0xcc,0x00,0x11,0x82,0x1c]
+
+0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x5c
+# GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x5c]
+
+0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x3c
+# GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x3c]
+
+0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x63,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x63,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x40,0x63,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x63,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x04,0x63,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x63,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x9c]
+
+0x10,0x02,0x63,0xcc,0x00,0x11,0x42,0x5c
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x63,0xcc,0x00,0x11,0x42,0x5c]
+
+0x10,0x01,0x63,0xcc,0x00,0x11,0x42,0x3c
+# GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x63,0xcc,0x00,0x11,0x42,0x3c]
+
+0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x1b]
+
+0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x9b]
+
+0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x1c]
+
+0x1a,0x20,0x64,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x1a,0x20,0x64,0xcc,0x00,0x11,0x42,0x1c]
+
+0x1a,0x40,0x64,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x1a,0x40,0x64,0xcc,0x00,0x11,0x42,0x1c]
+
+0x1a,0x04,0x64,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x1a,0x04,0x64,0xcc,0x00,0x11,0x42,0x1c]
+
+0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x9c]
+
+0x1a,0x02,0x64,0xcc,0x00,0x11,0x42,0x5c
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x1a,0x02,0x64,0xcc,0x00,0x11,0x42,0x5c]
+
+0x1a,0x01,0x64,0xcc,0x00,0x11,0x42,0x3c
+# GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x1a,0x01,0x64,0xcc,0x00,0x11,0x42,0x3c]
+
+0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x61,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x61,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x40,0x61,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x61,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x04,0x61,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x61,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x9c]
+
+0x10,0x02,0x61,0xcc,0x00,0x11,0x42,0x5c
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x61,0xcc,0x00,0x11,0x42,0x5c]
+
+0x10,0x01,0x61,0xcc,0x00,0x11,0x42,0x3c
+# GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x61,0xcc,0x00,0x11,0x42,0x3c]
+
+0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x71,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x71,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x40,0x71,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x71,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x04,0x71,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x71,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x9c]
+
+0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x70,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x70,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x40,0x70,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x70,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x04,0x70,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x70,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x9c]
+
+0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x6f,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6f,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x40,0x6f,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6f,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x04,0x6f,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6f,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x9c]
+
+0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x6e,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6e,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x40,0x6e,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6e,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x04,0x6e,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6e,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c]
+
+0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x62,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x62,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x40,0x62,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x62,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x04,0x62,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x62,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x9c]
+
+0x10,0x02,0x62,0xcc,0x00,0x11,0x42,0x5c
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x62,0xcc,0x00,0x11,0x42,0x5c]
+
+0x10,0x01,0x62,0xcc,0x00,0x11,0x42,0x3c
+# GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x62,0xcc,0x00,0x11,0x42,0x3c]
+
+0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x60,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x60,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x40,0x60,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x60,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x04,0x60,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x60,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x9c
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x9c]
+
+0x10,0x02,0x60,0xcc,0x00,0x11,0x42,0x5c
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x60,0xcc,0x00,0x11,0x42,0x5c]
+
+0x10,0x01,0x60,0xcc,0x00,0x11,0x42,0x3c
+# GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x60,0xcc,0x00,0x11,0x42,0x3c]
+
+0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x1b
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x1b]
+
+0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x9b
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x9b]
+
+0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x1c
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x1c]
+
+0x04,0x20,0x5d,0xcc,0x00,0x05,0x12,0x1c
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse ; encoding: [0x04,0x20,0x5d,0xcc,0x00,0x05,0x12,0x1c]
+
+0x04,0x40,0x5d,0xcc,0x00,0x05,0x12,0x1c
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse ; encoding: [0x04,0x40,0x5d,0xcc,0x00,0x05,0x12,0x1c]
+
+0x04,0x04,0x5d,0xcc,0x00,0x05,0x12,0x1c
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x04,0x5d,0xcc,0x00,0x05,0x12,0x1c]
+
+0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x9c
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x9c]
+
+0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x5c
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x5c]
+
+0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x3c
+# GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x3c]
+
+0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x1b
+# GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x1b]
+
+0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x9b
+# GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x9b]
+
+0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x1c]
+
+0x10,0x20,0x6d,0xcc,0x00,0x11,0x42,0x1c
+# GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23],
v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6d,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x6d,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6d,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x6d,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6d,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x6c,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6c,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x6c,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6c,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x6c,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6c,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x6b,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6b,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x6b,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6b,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x6b,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6b,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: 
[0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x6a,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6a,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x6a,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6a,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x6a,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6a,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x72,0xcc,0x00,0x11,0x06,0x1a +# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1 ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x06,0x1a] + +0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x72,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x72,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x5c +# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x5c] + +0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c +# GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c] + +0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x87,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x87,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x87,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x87,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x87,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x87,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: 
v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x86,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x86,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x86,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x86,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x86,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x86,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x85,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x85,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x85,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x85,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x85,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x85,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x84,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x84,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x84,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x84,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x84,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x84,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], 
v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x83,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x83,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x83,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x83,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x83,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x83,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x82,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x82,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x82,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x82,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x82,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x82,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x81,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x81,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x81,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: 
[0x10,0x40,0x81,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x81,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x81,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x9c] + +0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x1b +# GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x1b] + +0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x9b +# GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x9b] + +0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x20,0x80,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x80,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x40,0x80,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x80,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x04,0x80,0xcc,0x00,0x11,0x42,0x1c +# GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x80,0xcc,0x00,0x11,0x42,0x1c] + +0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x9c +# GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x9c] + +0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x1b +# GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x1b] + +0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x9b +# GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x9b] + +0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x1c +# GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x1c] + +0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x1c +# GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x1c] + +0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x9c +# GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x9c] + +0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c +# GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt index 802d6368507e2..618e081525414 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt @@ -11310,18 +11310,6 @@ # CHECK: v_alignbit_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01 -# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbit_b32 v5, v1, v2, v3 
op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04 - # CHECK: v_alignbyte_b32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04 @@ -11418,18 +11406,6 @@ # CHECK: v_alignbyte_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01 -# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04 - # CHECK: v_min3_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04 diff --git a/llvm/test/MC/ELF/mc-dump.s b/llvm/test/MC/ELF/mc-dump.s index 9cf15a7ddee46..5cc2e9fa50179 100644 --- a/llvm/test/MC/ELF/mc-dump.s +++ b/llvm/test/MC/ELF/mc-dump.s @@ -11,40 +11,46 @@ # CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops # CHECK-NEXT:0 Data Size:0 [] # CHECK-NEXT: Symbol @0 _start +# CHECK-NEXT: Symbol @0 Temporary # CHECK-NEXT:0 Org Offset:3 Value:0 -# CHECK-NEXT:3 Relaxable Size:2 > +# CHECK-NEXT:3 Relaxable Size:0+2 [eb,00] > # CHECK-NEXT: Fixup @1 Value:.Ltmp0 Kind:4001 # CHECK-NEXT:5 Data Size:16 [48,8b,04,25,00,00,00,00,48,8b,04,25,00,00,00,00] # CHECK-NEXT: Fixup @4 Value:f0@ Kind:4017 # CHECK-NEXT: Fixup @12 Value:_start@ Kind:4017 # CHECK-NEXT: Symbol @16 .Ltmp0 Temporary +# CHECK-NEXT: Symbol @0 Temporary +# CHECK-NEXT: Symbol @16 Temporary # CHECK-NEXT:MCSection Name:.data # CHECK-NEXT:0 Data Size:0 [] # CHECK-NEXT: Symbol @0 .data # CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 # CHECK-NEXT:0 Data Size:4 [01,00,00,00] # CHECK-NEXT:4 Fill Value:0 ValueSize:1 NumValues:1 -# CHECK-NEXT:5 LEB Value:.Ltmp0-_start Signed:0 -# CHECK-NEXT:] +# CHECK-NEXT:5 LEB Size:0+1 [15] Value:.Ltmp0-_start Signed:0 +# CHECK:] # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t -debug-only=mc-dump -save-temp-labels -g 2>&1 | FileCheck %s --check-prefix=CHECK2 # CHECK2:5 Data Size:16 [48,8b,04,25,00,00,00,00,48,8b,04,25,00,00,00,00] # CHECK2-NEXT: Fixup @4 Value:f0@ Kind:4017 # CHECK2-NEXT: Fixup @12 Value:_start@ Kind:4017 -# CHECK2-NEXT: Symbol @16 .Ltmp1 -# CHECK2-NEXT: Symbol @0 .Ltmp3 Temporary -# CHECK2-NEXT: Symbol @8 .Ltmp4 Temporary -# CHECK2-NEXT: Symbol @16 .Ltmp5 Temporary -# CHECK2-NEXT: Symbol @16 .Lsec_end0 Temporary +# CHECK2-NEXT: Symbol @16 .Ltmp2 +# CHECK2-NEXT: Symbol @0 .Lcfi0 Temporary +# CHECK2:MCSection Name:.eh_frame +# CHECK2:24 DwarfCallFrame Size:17+1 [00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,45] AddrDelta:.Lcfi0-.Ltmp1 +# CHECK2-NEXT: Fixup @0 Value:.Ltmp12-.Ltmp11-0 Kind:4003 _start: var = _start +.cfi_startproc .org 3 jmp 1f +.cfi_offset %rbp, -24 movq f0@GOTPCREL, %rax movq _start@GOTPCREL, %rax 1: +.cfi_endproc .data .p2align 2 diff --git a/llvm/test/MC/ELF/reloc-directive.s 
b/llvm/test/MC/ELF/reloc-directive.s index 42995aa9e7d81..9871fba2e0021 100644 --- a/llvm/test/MC/ELF/reloc-directive.s +++ b/llvm/test/MC/ELF/reloc-directive.s @@ -9,15 +9,18 @@ # ASM-NEXT: .reloc .Ltmp1-1, R_X86_64_NONE, foo # ASM-NEXT: .Ltmp2: # ASM-NEXT: .reloc 2+.Ltmp2, R_X86_64_NONE, local -# ASM-NEXT: .reloc 1+foo+3, R_X86_64_NONE, data+1 -# ASM-NEXT: .Ltmp3: -# ASM-NEXT: .reloc .Ltmp3, BFD_RELOC_NONE, unused # CHECK: 0x2 R_X86_64_NONE foo 0x0 # CHECK-NEXT: 0x0 R_X86_64_NONE foo 0x0 # CHECK-NEXT: 0x3 R_X86_64_NONE local 0x0 -# CHECK-NEXT: 0x4 R_X86_64_NONE data 0x1 # CHECK-NEXT: 0x1 R_X86_64_NONE unused 0x0 +# CHECK-NEXT: 0x4 R_X86_64_NONE data 0x1 + +# CHECK: .rela.my { +# CHECK: 0x0 R_X86_64_NONE foo 0x0 +# CHECK-NEXT: 0x4 R_X86_64_NONE foo 0x0 +# CHECK-NEXT: 0x8 R_X86_64_NONE foo 0x0 +# CHECK-NEXT: } .text .globl foo @@ -27,17 +30,25 @@ local: .reloc .+3-2, R_X86_64_NONE, foo .reloc .-1, R_X86_64_NONE, foo .reloc 2+., R_X86_64_NONE, local - .reloc 1+foo+3, R_X86_64_NONE, data+1 .reloc ., BFD_RELOC_NONE, unused + .space 3 .data .globl data data: + .reloc 1+foo+3, R_X86_64_NONE, data+1 .long 0 -# RUN: not llvm-mc -filetype=obj -triple=x86_64 --defsym=ERR=1 %s 2>&1 | FileCheck %s --check-prefix=ERR +## Constant offsets are relative to the section start. +.section .my +.word 0 +.reloc 0, BFD_RELOC_NONE, foo +.word 0 +.p2align 3 +.reloc 2+2, BFD_RELOC_NONE, foo +.p2align 4 +.reloc 8, BFD_RELOC_NONE, foo -.ifdef ERR .text .globl a, b a: ret @@ -45,22 +56,26 @@ b: ret x: ret y: ret -# ERR: {{.*}}.s:[[#@LINE+1]]:10: error: expected comma +# RUN: not llvm-mc -filetype=obj -triple=x86_64 --defsym=PARSE=1 %s 2>&1 | FileCheck %s --check-prefix=PARSE +# RUN: not llvm-mc -filetype=obj -triple=x86_64 --defsym=ERR=1 %s 2>&1 | FileCheck %s --check-prefix=ERR + +.ifdef PARSE +# PARSE: {{.*}}.s:[[#@LINE+1]]:10: error: expected comma .reloc 0 R_X86_64_NONE, a -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is negative +# PARSE: {{.*}}.s:[[#@LINE+1]]:8: error: directional label undefined +.reloc 1f, R_X86_64_NONE, a +.endif + +.ifdef ERR .reloc -1, R_X86_64_NONE, a -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is not relocatable +# ERR: {{.*}}.s:[[#@LINE+1]]:9: error: .reloc offset is not relocatable .reloc 2*., R_X86_64_NONE, a -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is not relocatable +# ERR: {{.*}}.s:[[#@LINE+1]]:9: error: .reloc offset is not relocatable .reloc a+a, R_X86_64_NONE, a -## GNU as accepts a-a but rejects b-a. -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is not representable -.reloc a-a, R_X86_64_NONE, a -## TODO GNU as accepts x-x and y-x. 
-# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is not representable +# ERR: {{.*}}.s:[[#@LINE+1]]:9: error: .reloc offset is not relative to a section +.reloc b-a, R_X86_64_NONE, a +# ERR: {{.*}}.s:[[#@LINE+1]]:9: error: .reloc offset is not relative to a section .reloc x-x, R_X86_64_NONE, a -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: directional label undefined -.reloc 1f, R_X86_64_NONE, a .endif diff --git a/llvm/test/MC/LoongArch/Relocations/reloc-directive-err.s b/llvm/test/MC/LoongArch/Relocations/reloc-directive-err.s index 60fd145564ae5..7658865b0f083 100644 --- a/llvm/test/MC/LoongArch/Relocations/reloc-directive-err.s +++ b/llvm/test/MC/LoongArch/Relocations/reloc-directive-err.s @@ -2,6 +2,6 @@ # RUN: not llvm-mc --filetype=obj --triple=loongarch64 %s -o /dev/null 2>&1 \ # RUN: | FileCheck %s -# PRINT: .reloc 0, R_INVALID, 0 +# PRINT: .reloc {{.*}}, R_INVALID, 0 # CHECK: {{.*}}.s:[[# @LINE+1]]:11: error: unknown relocation name -.reloc 0, R_INVALID, 0 +.reloc ., R_INVALID, 0 diff --git a/llvm/test/MC/LoongArch/Relocations/reloc-directive.s b/llvm/test/MC/LoongArch/Relocations/reloc-directive.s index f900f17c06c39..2fc0c816d7057 100644 --- a/llvm/test/MC/LoongArch/Relocations/reloc-directive.s +++ b/llvm/test/MC/LoongArch/Relocations/reloc-directive.s @@ -2,31 +2,23 @@ # RUN: llvm-mc --filetype=obj --triple=loongarch64 %s \ # RUN: | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_LARCH_NONE, .data -# PRINT: .reloc 4, R_LARCH_NONE, foo+4 -# PRINT: .reloc 0, R_LARCH_NONE, 8 -# PRINT: .reloc 0, R_LARCH_32, .data+2 -# PRINT: .reloc 0, R_LARCH_TLS_DTPMOD32, foo+3 -# PRINT: .reloc 0, R_LARCH_IRELATIVE, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+8, R_LARCH_NONE, .data .text + .reloc .+8, R_LARCH_NONE, .data + .reloc .+4, R_LARCH_NONE, foo+4 + .reloc .+0, R_LARCH_NONE, 8 + + .reloc .+0, R_LARCH_32, .data+2 + .reloc .+0, R_LARCH_TLS_DTPMOD32, foo+3 + .reloc .+0, R_LARCH_IRELATIVE, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 ret nop nop - .reloc 8, R_LARCH_NONE, .data - .reloc 4, R_LARCH_NONE, foo+4 - .reloc 0, R_LARCH_NONE, 8 - - .reloc 0, R_LARCH_32, .data+2 - .reloc 0, R_LARCH_TLS_DTPMOD32, foo+3 - .reloc 0, R_LARCH_IRELATIVE, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/Mips/nacl-mask.s b/llvm/test/MC/Mips/nacl-mask.s deleted file mode 100644 index 17226475c2135..0000000000000 --- a/llvm/test/MC/Mips/nacl-mask.s +++ /dev/null @@ -1,319 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple=mipsel-unknown-nacl %s \ -# RUN: | llvm-objdump --no-print-imm-hex -d -z --no-show-raw-insn - | FileCheck %s - -# This test tests that address-masking sandboxing is added when given assembly -# input. - - -# Test that address-masking sandboxing is added before indirect branches and -# returns. - - .align 4 -test1: - .set noreorder - - jr $a0 - nop - jr $ra - nop - -# CHECK-LABEL: : - -# CHECK: and $4, $4, $14 -# CHECK-NEXT: jr $4 - -# Check that additional nop is inserted, to align mask and jr to the next -# bundle. - -# CHECK-NEXT: nop -# CHECK-NEXT: nop - -# CHECK: and $ra, $ra, $14 -# CHECK-NEXT: jr $ra - - - -# Test that address-masking sandboxing is added before load instructions. 
- - .align 4 -test2: - .set noreorder - - lb $4, 0($1) - nop - lbu $4, 0($2) - lh $4, 0($3) - lhu $1, 0($4) - lw $4, 0($5) - lwc1 $f0, 0($6) - ldc1 $f2, 0($7) - ll $4, 0($8) - lwl $4, 0($9) - lwr $4, 0($10) - - lw $4, 0($sp) - lw $4, 0($t8) - -# CHECK-LABEL: : - -# CHECK: and $1, $1, $15 -# CHECK-NEXT: lb $4, 0($1) - -# Check that additional nop is inserted, to align mask and load to the next -# bundle. - -# CHECK: nop -# CHECK: nop - -# CHECK: and $2, $2, $15 -# CHECK-NEXT: lbu $4, 0($2) - -# CHECK: and $3, $3, $15 -# CHECK-NEXT: lh $4, 0($3) - -# CHECK: and $4, $4, $15 -# CHECK-NEXT: lhu $1, 0($4) - -# CHECK: and $5, $5, $15 -# CHECK-NEXT: lw $4, 0($5) - -# CHECK: and $6, $6, $15 -# CHECK-NEXT: lwc1 $f0, 0($6) - -# CHECK: and $7, $7, $15 -# CHECK-NEXT: ldc1 $f2, 0($7) - -# CHECK: and $8, $8, $15 -# CHECK-NEXT: ll $4, 0($8) - -# CHECK: and $9, $9, $15 -# CHECK-NEXT: lwl $4, 0($9) - -# CHECK: and $10, $10, $15 -# CHECK-NEXT: lwr $4, 0($10) - - -# Check that loads where base register is $sp or $t8 (thread pointer register) -# are not masked. - -# CHECK-NOT: and -# CHECK: lw $4, 0($sp) -# CHECK-NOT: and -# CHECK: lw $4, 0($24) - - - -# Test that address-masking sandboxing is added before store instructions. - - .align 4 -test3: - .set noreorder - - sb $4, 0($1) - nop - sh $4, 0($2) - sw $4, 0($3) - swc1 $f0, 0($4) - sdc1 $f2, 0($5) - swl $4, 0($6) - swr $4, 0($7) - sc $4, 0($8) - - sw $4, 0($sp) - sw $4, 0($t8) - -# CHECK-LABEL: : - -# CHECK: and $1, $1, $15 -# CHECK-NEXT: sb $4, 0($1) - -# Check that additional nop is inserted, to align mask and store to the next -# bundle. - -# CHECK: nop -# CHECK: nop - -# CHECK: and $2, $2, $15 -# CHECK-NEXT: sh $4, 0($2) - -# CHECK: and $3, $3, $15 -# CHECK-NEXT: sw $4, 0($3) - -# CHECK: and $4, $4, $15 -# CHECK-NEXT: swc1 $f0, 0($4) - -# CHECK: and $5, $5, $15 -# CHECK-NEXT: sdc1 $f2, 0($5) - -# CHECK: and $6, $6, $15 -# CHECK-NEXT: swl $4, 0($6) - -# CHECK: and $7, $7, $15 -# CHECK-NEXT: swr $4, 0($7) - -# CHECK: and $8, $8, $15 -# CHECK-NEXT: sc $4, 0($8) - - -# Check that stores where base register is $sp or $t8 (thread pointer register) -# are not masked. - -# CHECK-NOT: and -# CHECK: sw $4, 0($sp) -# CHECK-NOT: and -# CHECK: sw $4, 0($24) - - - -# Test that address-masking sandboxing is added after instructions that change -# stack pointer. - - .align 4 -test4: - .set noreorder - - addiu $sp, $sp, 24 - nop - addu $sp, $sp, $1 - lw $sp, 0($2) - lw $sp, 123($sp) - sw $sp, 123($sp) - -# CHECK-LABEL: : - -# CHECK: addiu $sp, $sp, 24 -# CHECK-NEXT: and $sp, $sp, $15 - -# Check that additional nop is inserted, to align instruction and mask to the -# next bundle. - -# CHECK: nop -# CHECK: nop - -# CHECK: addu $sp, $sp, $1 -# CHECK-NEXT: and $sp, $sp, $15 - -# Since we next check sandboxing sequence which consists of 3 instructions, -# check that 2 additional nops are inserted, to align it to the next bundle. - -# CHECK: nop -# CHECK: nop - - -# Check that for instructions that change stack-pointer and load from memory -# masks are added before and after the instruction. - -# CHECK: and $2, $2, $15 -# CHECK-NEXT: lw $sp, 0($2) -# CHECK-NEXT: and $sp, $sp, $15 - -# For loads where $sp is destination and base, check that mask is added after -# but not before. - -# CHECK-NOT: and -# CHECK: lw $sp, 123($sp) -# CHECK-NEXT: and $sp, $sp, $15 - -# For stores where $sp is destination and base, check that mask is added neither -# before nor after. 
- -# CHECK-NOT: and -# CHECK: sw $sp, 123($sp) -# CHECK-NOT: and - - - -# Test that call + branch delay is aligned at bundle end. Test that mask is -# added before indirect calls. - - .align 4 -test5: - .set noreorder - - jal func1 - addiu $4, $zero, 1 - - nop - bal func2 - addiu $4, $zero, 2 - - nop - nop - bltzal $t1, func3 - addiu $4, $zero, 3 - - nop - nop - nop - bgezal $t2, func4 - addiu $4, $zero, 4 - - jalr $t9 - addiu $4, $zero, 5 - - -# CHECK: nop -# CHECK-NEXT: nop -# CHECK-LABEL: : -# CHECK-NEXT: jal -# CHECK-NEXT: addiu $4, $zero, 1 - -# CHECK-NEXT: nop -# CHECK-NEXT: nop -# CHECK-NEXT: bal -# CHECK-NEXT: addiu $4, $zero, 2 - -# CHECK-NEXT: nop -# CHECK-NEXT: nop -# CHECK-NEXT: bltzal -# CHECK-NEXT: addiu $4, $zero, 3 - -# CHECK-NEXT: nop -# CHECK-NEXT: nop -# CHECK-NEXT: nop -# CHECK-NEXT: nop - -# CHECK-NEXT: nop -# CHECK-NEXT: nop -# CHECK-NEXT: bgezal -# CHECK-NEXT: addiu $4, $zero, 4 - -# CHECK-NEXT: nop -# CHECK-NEXT: and $25, $25, $14 -# CHECK-NEXT: jalr $25 -# CHECK-NEXT: addiu $4, $zero, 5 - - - -# Test that we can put non-dangerous loads and stores in branch delay slot. - - .align 4 -test6: - .set noreorder - - jal func1 - sw $4, 0($sp) - - bal func2 - lw $5, 0($t8) - - jalr $t9 - sw $sp, 0($sp) - - - -# CHECK: nop -# CHECK-NEXT: nop -# CHECK-LABEL: : -# CHECK-NEXT: jal -# CHECK-NEXT: sw $4, 0($sp) - -# CHECK-NEXT: nop -# CHECK-NEXT: nop -# CHECK-NEXT: bal -# CHECK-NEXT: lw $5, 0($24) - -# CHECK-NEXT: nop -# CHECK-NEXT: and $25, $25, $14 -# CHECK-NEXT: jalr -# CHECK-NEXT: sw $sp, 0($sp) diff --git a/llvm/test/MC/Mips/reloc-directive-bad-obj.s b/llvm/test/MC/Mips/reloc-directive-bad-obj.s index 86d6d0cc66c57..74e5dae5264f6 100644 --- a/llvm/test/MC/Mips/reloc-directive-bad-obj.s +++ b/llvm/test/MC/Mips/reloc-directive-bad-obj.s @@ -2,8 +2,8 @@ # RUN: -target-abi=o32 -filetype=obj -o /dev/null 2>&1 | FileCheck %s .text nop -.reloc foo, R_MIPS_32, .text # CHECK: :[[@LINE]]:24: error: unresolved relocation offset +.reloc foo, R_MIPS_32, .text # CHECK: :[[@LINE]]:8: error: .reloc offset is not relative to a section nop nop -.reloc bar, R_MIPS_32, .text # CHECK: :[[@LINE]]:24: error: unresolved relocation offset +.reloc bar, R_MIPS_32, .text # CHECK: :[[@LINE]]:8: error: .reloc offset is not relative to a section nop diff --git a/llvm/test/MC/Mips/reloc-directive-bad.s b/llvm/test/MC/Mips/reloc-directive-bad.s index bb056b752fb9f..f09a73b962b01 100644 --- a/llvm/test/MC/Mips/reloc-directive-bad.s +++ b/llvm/test/MC/Mips/reloc-directive-bad.s @@ -2,6 +2,6 @@ # RUN: -target-abi=o32 2>&1 | FileCheck %s .text foo: - .reloc 0, R_MIPS_32, .text+.text # CHECK: :[[@LINE]]:23: error: expression must be relocatable - .reloc 0, 0, R_MIPS_32, .text # CHECK: :[[@LINE]]:12: error: expected relocation name + .reloc ., R_MIPS_32, .text+.text # CHECK: :[[@LINE]]:23: error: expression must be relocatable + .reloc ., 0, R_MIPS_32, .text # CHECK: :[[@LINE]]:12: error: expected relocation name nop diff --git a/llvm/test/MC/Mips/reloc-directive-label-offset.s b/llvm/test/MC/Mips/reloc-directive-label-offset.s index 257bfeb10d151..279fc7860dcea 100644 --- a/llvm/test/MC/Mips/reloc-directive-label-offset.s +++ b/llvm/test/MC/Mips/reloc-directive-label-offset.s @@ -58,18 +58,18 @@ bar: # OBJ-N32-LABEL: Relocations [ # OBJ-N32: 0x4 R_MIPS_NONE .text -# OBJ-N32-NEXT: 0x1C R_MIPS_GOT_OFST .text -# OBJ-N32-NEXT: 0x0 R_MIPS_32 .text # OBJ-N32-NEXT: 0xC R_MIPS_32 .text # OBJ-N32-NEXT: 0x10 R_MIPS_CALL16 foo # OBJ-N32-NEXT: 0x20 R_MIPS_GOT_DISP foo # OBJ-N32-NEXT: 0x24 R_MIPS_GOT_PAGE .text +# 
OBJ-N32-NEXT: 0x1C R_MIPS_GOT_OFST .text +# OBJ-N32-NEXT: 0x0 R_MIPS_32 .text # OBJ-N64-LABEL: Relocations [ # OBJ-N64: 0x4 R_MIPS_NONE/R_MIPS_NONE/R_MIPS_NONE .text 0x0 -# OBJ-N64-NEXT: 0x1C R_MIPS_GOT_OFST/R_MIPS_NONE/R_MIPS_NONE .text 0x0 -# OBJ-N64-NEXT: 0x0 R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE .text 0x0 # OBJ-N64-NEXT: 0xC R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE .text 0x0 # OBJ-N64-NEXT: 0x10 R_MIPS_CALL16/R_MIPS_NONE/R_MIPS_NONE foo 0x0 # OBJ-N64-NEXT: 0x20 R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE foo 0x0 # OBJ-N64-NEXT: 0x24 R_MIPS_GOT_PAGE/R_MIPS_NONE/R_MIPS_NONE .text 0x0 +# OBJ-N64-NEXT: 0x1C R_MIPS_GOT_OFST/R_MIPS_NONE/R_MIPS_NONE .text 0x0 +# OBJ-N64-NEXT: 0x0 R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE .text 0x0 diff --git a/llvm/test/MC/Mips/reloc-directive.s b/llvm/test/MC/Mips/reloc-directive.s index 2f699ec98a609..4f875687f33b7 100644 --- a/llvm/test/MC/Mips/reloc-directive.s +++ b/llvm/test/MC/Mips/reloc-directive.s @@ -15,78 +15,79 @@ # RUN: FileCheck -check-prefix=OBJ-N64 %s .text foo: - .reloc 4, R_MIPS_NONE, foo # ASM: .reloc 4, R_MIPS_NONE, foo - .reloc 0, R_MIPS_NONE, foo+4 # ASM: .reloc 0, R_MIPS_NONE, foo+4 - .reloc 8, R_MIPS_32, foo+8 # ASM: .reloc 8, R_MIPS_32, foo+8 +# ASM: .reloc {{.*}}+4, R_MIPS_NONE, foo + .reloc .+4, R_MIPS_NONE, foo + .reloc .+0, R_MIPS_NONE, foo+4 + .reloc .+8, R_MIPS_32, foo+8 nop nop nop - .reloc 12, R_MIPS_NONE # ASM: .reloc 12, R_MIPS_NONE{{$}} + .reloc ., R_MIPS_NONE nop - .reloc 16, R_MIPS_CALL_HI16, 4 # ASM: .reloc 16, R_MIPS_CALL_HI16, 4 + .reloc ., R_MIPS_CALL_HI16, 4 nop - .reloc 20, R_MIPS_CALL_LO16, 4 # ASM: .reloc 20, R_MIPS_CALL_LO16, 4 + .reloc ., R_MIPS_CALL_LO16, 4 nop - .reloc 24, R_MIPS_CALL16, 4 # ASM: .reloc 24, R_MIPS_CALL16, 4 + .reloc ., R_MIPS_CALL16, 4 nop - .reloc 28, R_MIPS_GOT16, 4 # ASM: .reloc 28, R_MIPS_GOT16, 4 + .reloc ., R_MIPS_GOT16, 4 nop - .reloc 32, R_MIPS_GOT_PAGE, 4 # ASM: .reloc 32, R_MIPS_GOT_PAGE, 4 + .reloc ., R_MIPS_GOT_PAGE, 4 nop - .reloc 36, R_MIPS_GOT_OFST, 4 # ASM: .reloc 36, R_MIPS_GOT_OFST, 4 + .reloc ., R_MIPS_GOT_OFST, 4 nop - .reloc 40, R_MIPS_GOT_DISP, 4 # ASM: .reloc 40, R_MIPS_GOT_DISP, 4 + .reloc ., R_MIPS_GOT_DISP, 4 nop - .reloc 44, R_MIPS_GOT_HI16, 4 # ASM: .reloc 44, R_MIPS_GOT_HI16, 4 + .reloc ., R_MIPS_GOT_HI16, 4 nop - .reloc 48, R_MIPS_GOT_LO16, 4 # ASM: .reloc 48, R_MIPS_GOT_LO16, 4 + .reloc ., R_MIPS_GOT_LO16, 4 nop - .reloc 52, R_MIPS_TLS_GOTTPREL, 4 # ASM: .reloc 52, R_MIPS_TLS_GOTTPREL, 4 + .reloc ., R_MIPS_TLS_GOTTPREL, 4 nop - .reloc 56, R_MIPS_TLS_DTPREL_HI16, 4 # ASM: .reloc 56, R_MIPS_TLS_DTPREL_HI16, 4 + .reloc ., R_MIPS_TLS_DTPREL_HI16, 4 nop - .reloc 60, R_MIPS_TLS_DTPREL_LO16, 4 # ASM: .reloc 60, R_MIPS_TLS_DTPREL_LO16, 4 + .reloc ., R_MIPS_TLS_DTPREL_LO16, 4 nop - .reloc 64, R_MIPS_TLS_GD, 4 # ASM: .reloc 64, R_MIPS_TLS_GD, 4 + .reloc ., R_MIPS_TLS_GD, 4 nop - .reloc 68, R_MIPS_TLS_LDM, 4 # ASM: .reloc 68, R_MIPS_TLS_LDM, 4 + .reloc ., R_MIPS_TLS_LDM, 4 nop - .reloc 72, R_MIPS_TLS_TPREL_HI16, 4 # ASM: .reloc 72, R_MIPS_TLS_TPREL_HI16, 4 + .reloc ., R_MIPS_TLS_TPREL_HI16, 4 nop - .reloc 76, R_MIPS_TLS_TPREL_LO16, 4 # ASM: .reloc 76, R_MIPS_TLS_TPREL_LO16, 4 + .reloc ., R_MIPS_TLS_TPREL_LO16, 4 nop - .reloc 80, R_MICROMIPS_CALL16, 4 # ASM: .reloc 80, R_MICROMIPS_CALL16, 4 + .reloc ., R_MICROMIPS_CALL16, 4 nop - .reloc 84, R_MICROMIPS_GOT_DISP, 4 # ASM: .reloc 84, R_MICROMIPS_GOT_DISP, 4 + .reloc ., R_MICROMIPS_GOT_DISP, 4 nop - .reloc 88, R_MICROMIPS_GOT_PAGE, 4 # ASM: .reloc 88, R_MICROMIPS_GOT_PAGE, 4 + .reloc ., R_MICROMIPS_GOT_PAGE, 4 nop - .reloc 92, R_MICROMIPS_GOT_OFST, 4 
# ASM: .reloc 92, R_MICROMIPS_GOT_OFST, 4 + .reloc ., R_MICROMIPS_GOT_OFST, 4 nop - .reloc 96, R_MICROMIPS_GOT16, 4 # ASM: .reloc 96, R_MICROMIPS_GOT16, 4 + .reloc ., R_MICROMIPS_GOT16, 4 nop - .reloc 100, R_MICROMIPS_TLS_GOTTPREL, 4 # ASM: .reloc 100, R_MICROMIPS_TLS_GOTTPREL, 4 + .reloc ., R_MICROMIPS_TLS_GOTTPREL, 4 nop - .reloc 104, R_MICROMIPS_TLS_DTPREL_HI16, 4 # ASM: .reloc 104, R_MICROMIPS_TLS_DTPREL_HI16, 4 + .reloc ., R_MICROMIPS_TLS_DTPREL_HI16, 4 nop - .reloc 108, R_MICROMIPS_TLS_DTPREL_LO16, 4 # ASM: .reloc 108, R_MICROMIPS_TLS_DTPREL_LO16, 4 + .reloc ., R_MICROMIPS_TLS_DTPREL_LO16, 4 nop - .reloc 112, R_MICROMIPS_TLS_GD, 4 # ASM: .reloc 112, R_MICROMIPS_TLS_GD, 4 + .reloc ., R_MICROMIPS_TLS_GD, 4 nop - .reloc 116, R_MICROMIPS_TLS_LDM, 4 # ASM: .reloc 116, R_MICROMIPS_TLS_LDM, 4 + .reloc ., R_MICROMIPS_TLS_LDM, 4 nop - .reloc 120, R_MICROMIPS_TLS_TPREL_HI16, 4 # ASM: .reloc 120, R_MICROMIPS_TLS_TPREL_HI16, 4 + .reloc ., R_MICROMIPS_TLS_TPREL_HI16, 4 nop - .reloc 124, R_MICROMIPS_TLS_TPREL_LO16, 4 # ASM: .reloc 124, R_MICROMIPS_TLS_TPREL_LO16, 4 + .reloc ., R_MICROMIPS_TLS_TPREL_LO16, 4 nop - .reloc 128, R_MIPS_JALR, 4 # ASM: .reloc 128, R_MIPS_JALR, 4 + .reloc ., R_MIPS_JALR, 4 nop - .reloc 132, R_MICROMIPS_JALR, 4 # ASM: .reloc 132, R_MICROMIPS_JALR, 4 + .reloc ., R_MICROMIPS_JALR, 4 nop - .reloc 136, BFD_RELOC_NONE, 9 # ASM: .reloc 136, BFD_RELOC_NONE, 9 - .reloc 137, BFD_RELOC_16, 9 # ASM: .reloc 137, BFD_RELOC_16, 9 - .reloc 138, BFD_RELOC_32, 9 # ASM: .reloc 138, BFD_RELOC_32, 9 - .reloc 139, BFD_RELOC_64, 9 # ASM: .reloc 139, BFD_RELOC_64, 9 + .reloc ., BFD_RELOC_NONE, 9 + .reloc ., BFD_RELOC_16, 9 + .reloc ., BFD_RELOC_32, 9 + .reloc ., BFD_RELOC_64, 9 nop # OBJ-O32-LABEL: Name: .text @@ -134,9 +135,9 @@ foo: # OBJ-O32-NEXT: 0x80 R_MIPS_JALR - # OBJ-O32-NEXT: 0x84 R_MICROMIPS_JALR - # OBJ-O32-NEXT: 0x88 R_MIPS_NONE - -# OBJ-O32-NEXT: 0x89 R_MIPS_16 - -# OBJ-O32-NEXT: 0x8A R_MIPS_32 - -# OBJ-O32-NEXT: 0x8B R_MIPS_64 - +# OBJ-O32-NEXT: 0x88 R_MIPS_16 - +# OBJ-O32-NEXT: 0x88 R_MIPS_32 - +# OBJ-O32-NEXT: 0x88 R_MIPS_64 - # OBJ-O32-NEXT: 0x1C R_MIPS_GOT16 - # OBJ-O32-NEXT: 0x60 R_MICROMIPS_GOT16 - @@ -188,9 +189,9 @@ foo: # OBJ-N32-NEXT: 0x80 R_MIPS_JALR - 0x4 # OBJ-N32-NEXT: 0x84 R_MICROMIPS_JALR - 0x4 # OBJ-N32-NEXT: 0x88 R_MIPS_NONE - 0x9 -# OBJ-N32-NEXT: 0x89 R_MIPS_16 - 0x9 -# OBJ-N32-NEXT: 0x8A R_MIPS_32 - 0x9 -# OBJ-N32-NEXT: 0x8B R_MIPS_64 - 0x9 +# OBJ-N32-NEXT: 0x88 R_MIPS_16 - 0x9 +# OBJ-N32-NEXT: 0x88 R_MIPS_32 - 0x9 +# OBJ-N32-NEXT: 0x88 R_MIPS_64 - 0x9 # OBJ-N64-LABEL: Name: .text # OBJ-N64: 0000: 00000000 00000000 00000000 00000000 @@ -239,6 +240,6 @@ foo: # OBJ-N64-NEXT: 0x80 R_MIPS_JALR/R_MIPS_NONE/R_MIPS_NONE - 0x4 # OBJ-N64-NEXT: 0x84 R_MICROMIPS_JALR/R_MIPS_NONE/R_MIPS_NONE - 0x4 # OBJ-N64-NEXT: 0x88 R_MIPS_NONE/R_MIPS_NONE/R_MIPS_NONE - 0x9 -# OBJ-N64-NEXT: 0x89 R_MIPS_16/R_MIPS_NONE/R_MIPS_NONE - 0x9 -# OBJ-N64-NEXT: 0x8A R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE - 0x9 -# OBJ-N64-NEXT: 0x8B R_MIPS_64/R_MIPS_NONE/R_MIPS_NONE - 0x9 +# OBJ-N64-NEXT: 0x88 R_MIPS_16/R_MIPS_NONE/R_MIPS_NONE - 0x9 +# OBJ-N64-NEXT: 0x88 R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE - 0x9 +# OBJ-N64-NEXT: 0x88 R_MIPS_64/R_MIPS_NONE/R_MIPS_NONE - 0x9 diff --git a/llvm/test/MC/PowerPC/ppc32-reloc-directive.s b/llvm/test/MC/PowerPC/ppc32-reloc-directive.s index 3eb6c2964c85c..ca809a750a373 100644 --- a/llvm/test/MC/PowerPC/ppc32-reloc-directive.s +++ b/llvm/test/MC/PowerPC/ppc32-reloc-directive.s @@ -2,15 +2,15 @@ # RUN: llvm-mc -filetype=obj -triple=powerpc-linux-musl %s | llvm-readobj -r - | 
FileCheck %s -# PRINT: .reloc 8, R_PPC_NONE, .data -# PRINT: .reloc 4, R_PPC_NONE, foo+4 -# PRINT: .reloc 0, R_PPC_NONE, 8 -# PRINT: .reloc 0, R_PPC_ADDR32, .data+2 -# PRINT: .reloc 0, R_PPC_REL16_HI, foo+3 -# PRINT: .reloc 0, R_PPC_REL16_HA, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT: .reloc 0, BFD_RELOC_16, 9 -# PRINT: .reloc 0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+8, R_PPC_NONE, .data +# PRINT: .reloc {{.*}}+4, R_PPC_NONE, foo+4 +# PRINT: .reloc {{.*}}+0, R_PPC_NONE, 8 +# PRINT: .reloc {{.*}}+0, R_PPC_ADDR32, .data+2 +# PRINT: .reloc {{.*}}+0, R_PPC_REL16_HI, foo+3 +# PRINT: .reloc {{.*}}+0, R_PPC_REL16_HA, 5 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_NONE, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_16, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_32, 9 # CHECK: 0x8 R_PPC_NONE .data 0x0 # CHECK-NEXT: 0x4 R_PPC_NONE foo 0x4 @@ -23,19 +23,19 @@ # CHECK-NEXT: 0x0 R_PPC_ADDR32 - 0x9 .text + .reloc .+8, R_PPC_NONE, .data + .reloc .+4, R_PPC_NONE, foo+4 + .reloc .+0, R_PPC_NONE, 8 + .reloc .+0, R_PPC_ADDR32, .data+2 + .reloc .+0, R_PPC_REL16_HI, foo+3 + .reloc .+0, R_PPC_REL16_HA, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 blr nop nop - .reloc 8, R_PPC_NONE, .data - .reloc 4, R_PPC_NONE, foo+4 - .reloc 0, R_PPC_NONE, 8 - .reloc 0, R_PPC_ADDR32, .data+2 - .reloc 0, R_PPC_REL16_HI, foo+3 - .reloc 0, R_PPC_REL16_HA, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 .data .globl foo diff --git a/llvm/test/MC/PowerPC/ppc64-reloc-directive.s b/llvm/test/MC/PowerPC/ppc64-reloc-directive.s index 5f54ac73bcf16..2268a3c18bf97 100644 --- a/llvm/test/MC/PowerPC/ppc64-reloc-directive.s +++ b/llvm/test/MC/PowerPC/ppc64-reloc-directive.s @@ -4,16 +4,16 @@ # RUN: llvm-mc -filetype=obj -triple=powerpc64-linux-musl %s | llvm-readobj -r - | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=powerpc64le-linux-musl %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_PPC64_NONE, .data -# PRINT: .reloc 4, R_PPC64_NONE, foo+4 -# PRINT: .reloc 0, R_PPC64_NONE, 8 -# PRINT: .reloc 0, R_PPC64_ADDR32, .data+2 -# PRINT: .reloc 0, R_PPC64_REL16_HI, foo+3 -# PRINT: .reloc 0, R_PPC64_REL16_HA, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT: .reloc 0, BFD_RELOC_16, 9 -# PRINT: .reloc 0, BFD_RELOC_32, 9 -# PRINT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+8, R_PPC64_NONE, .data +# PRINT: .reloc {{.*}}+4, R_PPC64_NONE, foo+4 +# PRINT: .reloc {{.*}}+0, R_PPC64_NONE, 8 +# PRINT: .reloc {{.*}}+0, R_PPC64_ADDR32, .data+2 +# PRINT: .reloc {{.*}}+0, R_PPC64_REL16_HI, foo+3 +# PRINT: .reloc {{.*}}+0, R_PPC64_REL16_HA, 5 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_NONE, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_16, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_64, 9 # CHECK: 0x8 R_PPC64_NONE .data 0x0 # CHECK-NEXT: 0x4 R_PPC64_NONE foo 0x4 @@ -27,20 +27,20 @@ # CHECK-NEXT: 0x0 R_PPC64_ADDR64 - 0x9 .text + .reloc .+8, R_PPC64_NONE, .data + .reloc .+4, R_PPC64_NONE, foo+4 + .reloc .+0, R_PPC64_NONE, 8 + .reloc .+0, R_PPC64_ADDR32, .data+2 + .reloc .+0, R_PPC64_REL16_HI, foo+3 + .reloc .+0, R_PPC64_REL16_HA, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 blr nop nop - .reloc 8, R_PPC64_NONE, .data - .reloc 4, R_PPC64_NONE, foo+4 - .reloc 0, R_PPC64_NONE, 8 - .reloc 0, R_PPC64_ADDR32, .data+2 - .reloc 0, R_PPC64_REL16_HI, foo+3 - .reloc 0, R_PPC64_REL16_HA, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, 
BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/RISCV/Relocations/mc-dump.s b/llvm/test/MC/RISCV/Relocations/mc-dump.s index d1392c20e3ade..24f3e67ebbdda 100644 --- a/llvm/test/MC/RISCV/Relocations/mc-dump.s +++ b/llvm/test/MC/RISCV/Relocations/mc-dump.s @@ -7,7 +7,7 @@ # CHECK-NEXT: Symbol @0 .text # CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops # CHECK-NEXT:0 Data LinkerRelaxable Size:8 [97,00,00,00,e7,80,00,00] -# CHECK-NEXT: Fixup @0 Value:specifier(19,ext) Kind:4022 +# CHECK-NEXT: Fixup @0 Value:specifier(19,ext) Kind:4023 # CHECK-NEXT: Symbol @0 $x # CHECK-NEXT:8 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops # CHECK-NEXT:12 Data Size:4 [13,05,30,00] diff --git a/llvm/test/MC/RISCV/reloc-directive-err.s b/llvm/test/MC/RISCV/reloc-directive-err.s index 2b00019fcb8ea..370e4ceb95734 100644 --- a/llvm/test/MC/RISCV/reloc-directive-err.s +++ b/llvm/test/MC/RISCV/reloc-directive-err.s @@ -1,6 +1,6 @@ # RUN: llvm-mc -triple=riscv64 %s 2>&1 | FileCheck --check-prefix=PRINT %s # RUN: not llvm-mc -filetype=obj -triple=riscv64 %s -o /dev/null 2>&1 | FileCheck %s -# PRINT: .reloc 0, R_INVALID, 0 +# PRINT: .reloc {{.*}}, R_INVALID, 0 # CHECK: {{.*}}.s:[[# @LINE+1]]:11: error: unknown relocation name -.reloc 0, R_INVALID, 0 +.reloc ., R_INVALID, 0 diff --git a/llvm/test/MC/RISCV/reloc-directive.s b/llvm/test/MC/RISCV/reloc-directive.s index 0e217fa798482..4ab2889a17ac9 100644 --- a/llvm/test/MC/RISCV/reloc-directive.s +++ b/llvm/test/MC/RISCV/reloc-directive.s @@ -3,15 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=riscv32 %s | llvm-readobj -r - | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=riscv64 %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_RISCV_NONE, .data -# PRINT: .reloc 4, R_RISCV_NONE, foo+4 -# PRINT: .reloc 0, R_RISCV_NONE, 8 -# PRINT: .reloc 0, R_RISCV_32, .data+2 -# PRINT: .reloc 0, R_RISCV_SET32, foo+3 -# PRINT: .reloc 0, R_RISCV_32_PCREL, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+8, R_RISCV_NONE, .data # CHECK: 0x8 R_RISCV_NONE .data 0x0 # CHECK-NEXT: 0x4 R_RISCV_NONE foo 0x4 @@ -37,26 +29,26 @@ # CHECK-NEXT: } .text - ret - nop - nop - .reloc 8, R_RISCV_NONE, .data - .reloc 4, R_RISCV_NONE, foo+4 - .reloc 0, R_RISCV_NONE, 8 + .reloc .+8, R_RISCV_NONE, .data + .reloc .+4, R_RISCV_NONE, foo+4 + .reloc .+0, R_RISCV_NONE, 8 - .reloc 0, R_RISCV_32, .data+2 - .reloc 0, R_RISCV_SET32, foo+3 - .reloc 0, R_RISCV_32_PCREL, 5 + .reloc .+0, R_RISCV_32, .data+2 + .reloc .+0, R_RISCV_SET32, foo+3 + .reloc .+0, R_RISCV_32_PCREL, 5 - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 .reloc foo, R_RISCV_32, 6 .reloc line, R_RISCV_32, 6 .reloc probe, R_RISCV_32, 6 - .reloc foo+4, R_RISCV_32, 6 + ret + nop + nop + .data .globl foo foo: diff --git a/llvm/test/MC/RISCV/rv32p-valid.s b/llvm/test/MC/RISCV/rv32p-valid.s index c755acddc712e..c259c142f92b2 100644 --- a/llvm/test/MC/RISCV/rv32p-valid.s +++ b/llvm/test/MC/RISCV/rv32p-valid.s @@ -2,7 +2,7 @@ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-p < %s \ # RUN: | llvm-objdump --mattr=+experimental-p -M no-aliases -d -r --no-print-imm-hex - \ -# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ,CHECK-OBJ %s +# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ 
%s # CHECK-ASM-AND-OBJ: clz a0, a1 # CHECK-ASM: encoding: [0x13,0x95,0x05,0x60] @@ -73,7 +73,6 @@ psabs.b t0, t1 # CHECK-ASM-AND-OBJ: plui.h gp, 32 # CHECK-ASM: encoding: [0x9b,0x21,0x20,0xf0] plui.h gp, 32 -# CHECK-OBJ: plui.h gp, -412 -# CHECK-ASM: plui.h gp, 612 +# CHECK-ASM-AND-OBJ: plui.h gp, -412 # CHECK-ASM: encoding: [0x9b,0xa1,0x64,0xf0] plui.h gp, 612 diff --git a/llvm/test/MC/RISCV/rv64p-valid.s b/llvm/test/MC/RISCV/rv64p-valid.s index 6c48ad3469b47..3ea6b00bbe11c 100644 --- a/llvm/test/MC/RISCV/rv64p-valid.s +++ b/llvm/test/MC/RISCV/rv64p-valid.s @@ -2,7 +2,7 @@ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s # RUN: llvm-mc -filetype=obj --triple=riscv64 -mattr=+experimental-p < %s \ # RUN: | llvm-objdump --triple=riscv64 --mattr=+experimental-p -M no-aliases --no-print-imm-hex -d -r - \ -# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ,CHECK-OBJ %s +# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ %s # CHECK-ASM-AND-OBJ: clz a0, a1 # CHECK-ASM: encoding: [0x13,0x95,0x05,0x60] @@ -97,14 +97,12 @@ psabs.b a0, s2 # CHECK-ASM-AND-OBJ: plui.h s2, 4 # CHECK-ASM: encoding: [0x1b,0x29,0x04,0xf0] plui.h s2, 4 -# CHECK-OBJ: plui.h gp, -412 -# CHECK-ASM: plui.h gp, 612 +# CHECK-ASM-AND-OBJ: plui.h gp, -412 # CHECK-ASM: encoding: [0x9b,0xa1,0x64,0xf0] plui.h gp, 612 # CHECK-ASM-AND-OBJ: plui.w a2, 1 # CHECK-ASM: encoding: [0x1b,0x26,0x01,0xf2] plui.w a2, 1 -# CHECK-OBJ: plui.w a2, -1 -# CHECK-ASM: plui.w a2, 1023 +# CHECK-ASM-AND-OBJ: plui.w a2, -1 # CHECK-ASM: encoding: [0x1b,0xa6,0xff,0xf3] plui.w a2, 1023 diff --git a/llvm/test/MC/RISCV/xqci-fixups.s b/llvm/test/MC/RISCV/xqci-fixups.s new file mode 100644 index 0000000000000..410126d9fd857 --- /dev/null +++ b/llvm/test/MC/RISCV/xqci-fixups.s @@ -0,0 +1,36 @@ +# RUN: llvm-mc -filetype=obj -triple riscv32 < %s \ +# RUN: --mattr=+experimental-xqcili,+experimental-xqcilb,+experimental-xqcibi \ +# RUN: -riscv-add-build-attributes \ +# RUN: | llvm-objdump --no-print-imm-hex -M no-aliases -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INSTR %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 %s \ +# RUN: --mattr=+experimental-xqcili,+experimental-xqcilb,+experimental-xqcibi \ +# RUN: | llvm-readobj -r - | FileCheck %s -check-prefix=CHECK-REL + +## This checks that, if the assembler can resolve the qc fixup, the fixup +## is applied correctly to the instruction. + +.L0: +# CHECK-INSTR: qc.e.beqi a0, 64, 0x0 +qc.e.beqi a0, 64, .L0 +# CHECK-INSTR: qc.e.j 0x10000016 +qc.e.j func +# CHECK-INSTR: qc.e.li a0, 8 +qc.e.li a0, abs_sym +# CHECK-INSTR: qc.li a0, 8 +qc.li a0, %qc.abs20(abs_sym) + + + +# This has to come after the instructions that use it, or it will +# be evaluated at parse time (avoiding fixups). +abs_sym = 8 + + +.space 0x10000000 +func: + ret + +## All these fixups should be resolved by the assembler without emitting +## relocations. +# CHECK-REL-NOT: R_RISCV diff --git a/llvm/test/MC/RISCV/xqcibi-relocations.s b/llvm/test/MC/RISCV/xqcibi-relocations.s index 0f7cc8c5787a1..931cd7c9314bb 100644 --- a/llvm/test/MC/RISCV/xqcibi-relocations.s +++ b/llvm/test/MC/RISCV/xqcibi-relocations.s @@ -84,7 +84,20 @@ qc.bnei t3, 14, same_section # OBJ-NEXT: qc.e.bgeui s2, 0x18, 0x28 qc.e.bgeui s2, 24, same_section -.option norelax +## Enable compression/relaxation to check how symbols are handled.
+.option noexact
+
+# ASM: qc.bnei t1, 10, undef
+# OBJ: qc.beqi t1, 0xa, 0x42
+# OBJ-NEXT: j 0x3e
+# OBJ-NEXT: R_RISCV_JAL undef{{$}}
+qc.bnei t1, 10, undef
+
+# ASM: qc.e.bgeui s0, 40, undef
+# OBJ-NEXT: qc.e.bltui s0, 0x28, 0x4c
+# OBJ-NEXT: j 0x48
+# OBJ-NEXT: R_RISCV_JAL undef{{$}}
+qc.e.bgeui s0, 40, undef

 .section .text.second, "ax", @progbits

diff --git a/llvm/test/MC/RISCV/xqcilb-relocations.s b/llvm/test/MC/RISCV/xqcilb-relocations.s
index ab08beed9be94..48c8c6931c8af 100644
--- a/llvm/test/MC/RISCV/xqcilb-relocations.s
+++ b/llvm/test/MC/RISCV/xqcilb-relocations.s
@@ -92,7 +92,22 @@ qc.e.j same_section
 # OBJ-NEXT: R_RISCV_RELAX
 qc.e.jal same_section

-.option norelax
+## Enable compression/relaxation to check how symbols are handled.
+.option noexact
+
+qc.e.j undef
+# ASM: j undef
+# OBJ: qc.e.j 0x44
+# OBJ-NEXT: R_RISCV_VENDOR QUALCOMM{{$}}
+# OBJ-NEXT: R_RISCV_CUSTOM195 undef{{$}}
+# OBJ-NEXT: R_RISCV_RELAX
+
+qc.e.jal undef
+# ASM: jal undef
+# OBJ: qc.e.jal 0x4a
+# OBJ-NEXT: R_RISCV_VENDOR QUALCOMM{{$}}
+# OBJ-NEXT: R_RISCV_CUSTOM195 undef{{$}}
+# OBJ-NEXT: R_RISCV_RELAX

 .section .text.other, "ax", @progbits

diff --git a/llvm/test/MC/RISCV/xqcili-relocations.s b/llvm/test/MC/RISCV/xqcili-relocations.s
index 866586236fa46..7eff61fc782d8 100644
--- a/llvm/test/MC/RISCV/xqcili-relocations.s
+++ b/llvm/test/MC/RISCV/xqcili-relocations.s
@@ -97,7 +97,22 @@ qc.li a1, %qc.abs20(undef)
 # OBJ-NEXT: R_RISCV_RELAX
 qc.e.li s1, undef

-.option norelax
+## Enable compression/relaxation to check how symbols are handled.
+.option noexact
+
+# ASM: qc.li a1, %qc.abs20(undef)
+# OBJ-NEXT: qc.li a1, 0x0
+# OBJ-NEXT: R_RISCV_VENDOR QUALCOMM{{$}}
+# OBJ-NEXT: R_RISCV_CUSTOM192 undef{{$}}
+# OBJ-NEXT: R_RISCV_RELAX
+qc.li a1, %qc.abs20(undef)
+
+# ASM: qc.e.li a2, undef
+# OBJ-NEXT: qc.e.li a2, 0x0
+# OBJ-NEXT: R_RISCV_VENDOR QUALCOMM{{$}}
+# OBJ-NEXT: R_RISCV_CUSTOM194 undef{{$}}
+# OBJ-NEXT: R_RISCV_RELAX
+qc.e.li a2, undef

 .section .text.other, "ax", @progbits

diff --git a/llvm/test/MC/RISCV/xqcilia-valid.s b/llvm/test/MC/RISCV/xqcilia-valid.s
index 872ffbe8126aa..2396271d7db69 100644
--- a/llvm/test/MC/RISCV/xqcilia-valid.s
+++ b/llvm/test/MC/RISCV/xqcilia-valid.s
@@ -123,3 +123,43 @@ qc.e.addai x2, 48
 # CHECK-NOALIAS: c.andi s1, -1
 # CHECK-ENC: encoding: [0xfd,0x98]
 qc.e.andai x9, 4294967295
+
+# CHECK-ALIAS: addi t0, t2, 400
+# CHECK-NOALIAS: addi t0, t2, 400
+# CHECK-ENC: encoding: [0x93,0x82,0x03,0x19]
+qc.e.addi x5, x7, 400
+
+# CHECK-ALIAS: andi t0, t2, 750
+# CHECK-NOALIAS: andi t0, t2, 750
+# CHECK-ENC: encoding: [0x93,0xf2,0xe3,0x2e]
+qc.e.andi x5, x7, 750
+
+# CHECK-ALIAS: ori t0, t2, 854
+# CHECK-NOALIAS: ori t0, t2, 854
+# CHECK-ENC: encoding: [0x93,0xe2,0x63,0x35]
+qc.e.ori x5, x7, 854
+
+# CHECK-ALIAS: xori t0, t2, -200
+# CHECK-NOALIAS: xori t0, t2, -200
+# CHECK-ENC: encoding: [0x93,0xc2,0x83,0xf3]
+qc.e.xori x5, x7, -200
+
+# CHECK-ALIAS: addi t2, t2, 400
+# CHECK-NOALIAS: addi t2, t2, 400
+# CHECK-ENC: encoding: [0x93,0x83,0x03,0x19]
+qc.e.addai x7, 400
+
+# CHECK-ALIAS: andi t2, t2, 750
+# CHECK-NOALIAS: andi t2, t2, 750
+# CHECK-ENC: encoding: [0x93,0xf3,0xe3,0x2e]
+qc.e.andai x7, 750
+
+# CHECK-ALIAS: ori t2, t2, 854
+# CHECK-NOALIAS: ori t2, t2, 854
+# CHECK-ENC: encoding: [0x93,0xe3,0x63,0x35]
+qc.e.orai x7, 854
+
+# CHECK-ALIAS: xori t2, t2, -200
+# CHECK-NOALIAS: xori t2, t2, -200
+# CHECK-ENC: encoding: [0x93,0xc3,0x83,0xf3]
+qc.e.xorai x7, -200
diff --git a/llvm/test/MC/Sparc/Relocations/reloc-directive.s b/llvm/test/MC/Sparc/Relocations/reloc-directive.s
index 8899408ee428d..26164b3c2eb38 100644
--- a/llvm/test/MC/Sparc/Relocations/reloc-directive.s
+++ b/llvm/test/MC/Sparc/Relocations/reloc-directive.s
@@ -3,15 +3,7 @@
 # RUN: llvm-mc -filetype=obj -triple=sparc %s | llvm-readobj -r - | FileCheck %s
 # RUN: llvm-mc -filetype=obj -triple=sparcv9 %s | llvm-readobj -r - | FileCheck %s

-# PRINT: .reloc 8, R_SPARC_NONE, .data
-# PRINT: .reloc 4, R_SPARC_NONE, foo+4
-# PRINT: .reloc 0, R_SPARC_NONE, 8
-# PRINT: .reloc 0, R_SPARC_32, .data+2
-# PRINT: .reloc 0, R_SPARC_UA16, foo+3
-# PRINT: .reloc 0, R_SPARC_DISP32, foo+5
-# PRINT: .reloc 0, BFD_RELOC_NONE, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_32, foo+2
-# PRINT-NEXT: .reloc 0, BFD_RELOC_64, foo+3
+# PRINT: .reloc {{.*}}+8, R_SPARC_NONE, .data

 # CHECK: 0x8 R_SPARC_NONE .data 0x0
 # CHECK-NEXT: 0x4 R_SPARC_NONE foo 0x4
@@ -23,20 +15,20 @@
 # CHECK-NEXT: 0x0 R_SPARC_32 foo 0x2
 # CHECK-NEXT: 0x0 R_SPARC_64 foo 0x3
 .text
+ .reloc .+8, R_SPARC_NONE, .data
+ .reloc .+4, R_SPARC_NONE, foo+4
+ .reloc .+0, R_SPARC_NONE, 8
+
+ .reloc .+0, R_SPARC_32, .data+2
+ .reloc .+0, R_SPARC_UA16, foo+3
+ .reloc .+0, R_SPARC_DISP32, foo+5
+
+ .reloc .+0, BFD_RELOC_NONE, 9
+ .reloc .+0, BFD_RELOC_32, foo+2
+ .reloc .+0, BFD_RELOC_64, foo+3
 ret
 nop
 nop
- .reloc 8, R_SPARC_NONE, .data
- .reloc 4, R_SPARC_NONE, foo+4
- .reloc 0, R_SPARC_NONE, 8
-
- .reloc 0, R_SPARC_32, .data+2
- .reloc 0, R_SPARC_UA16, foo+3
- .reloc 0, R_SPARC_DISP32, foo+5
-
- .reloc 0, BFD_RELOC_NONE, 9
- .reloc 0, BFD_RELOC_32, foo+2
- .reloc 0, BFD_RELOC_64, foo+3

 .data
 .globl foo
diff --git a/llvm/test/MC/SystemZ/reloc-directive.s b/llvm/test/MC/SystemZ/reloc-directive.s
index abc6ca320642d..78c36e1434574 100644
--- a/llvm/test/MC/SystemZ/reloc-directive.s
+++ b/llvm/test/MC/SystemZ/reloc-directive.s
@@ -3,19 +3,7 @@
 # RUN: llvm-mc -filetype=obj -triple=s390x-linux-gnu %s -o %t
 # RUN: llvm-readobj -r %t | FileCheck %s

-# PRINT: .reloc 2, R_390_NONE, .data
-# PRINT-NEXT: .reloc 1, R_390_NONE, foo+4
-# PRINT-NEXT: .reloc 0, R_390_NONE, 8
-# PRINT-NEXT: .reloc 0, R_390_64, .data+2
-# PRINT-NEXT: .reloc 0, R_390_GOTENT, foo+3
-# PRINT-NEXT: .reloc 0, R_390_PC32DBL, 6
-# PRINT-NEXT: .reloc 4, R_390_12, foo
-# PRINT-NEXT: .reloc 2, R_390_20, foo
-# PRINT: .reloc 0, BFD_RELOC_NONE, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9
-# PRINT-NEXT: .reloc 0, BFD_RELOC_64, 9
+# PRINT: .reloc {{.*}}+2, R_390_NONE, .data

 # CHECK: 0x2 R_390_NONE .data 0x0
 # CHECK-NEXT: 0x1 R_390_NONE foo 0x4
@@ -32,23 +20,23 @@
 # CHECK-NEXT: 0x0 R_390_64 - 0x9

 .text
+ .reloc .+2, R_390_NONE, .data
+ .reloc .+1, R_390_NONE, foo+4
+ .reloc .+0, R_390_NONE, 8
+ .reloc .+0, R_390_64, .data+2
+ .reloc .+0, R_390_GOTENT, foo+3
+ .reloc .+0, R_390_PC32DBL, 6
+ .reloc .+4, R_390_12, foo
+ .reloc .+2, R_390_20, foo
+
+ .reloc .+0, BFD_RELOC_NONE, 9
+ .reloc .+0, BFD_RELOC_8, 9
+ .reloc .+0, BFD_RELOC_16, 9
+ .reloc .+0, BFD_RELOC_32, 9
+ .reloc .+0, BFD_RELOC_64, 9
 br %r14
 nop
 nop
- .reloc 2, R_390_NONE, .data
- .reloc 1, R_390_NONE, foo+4
- .reloc 0, R_390_NONE, 8
- .reloc 0, R_390_64, .data+2
- .reloc 0, R_390_GOTENT, foo+3
- .reloc 0, R_390_PC32DBL, 6
- .reloc 4, R_390_12, foo
- .reloc 2, R_390_20, foo
-
- .reloc 0, BFD_RELOC_NONE, 9
- .reloc 0, BFD_RELOC_8, 9
- .reloc 0, BFD_RELOC_16, 9
- .reloc 0, BFD_RELOC_32, 9
- .reloc 0, BFD_RELOC_64, 9

 .data
 .globl foo
diff --git a/llvm/test/MC/X86/AlignedBundling/align-mode-argument-error.s b/llvm/test/MC/X86/AlignedBundling/align-mode-argument-error.s
deleted file mode 100644
index 37c74c86f754e..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/align-mode-argument-error.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s - -# Missing .bundle_align_mode argument -# CHECK: error: unknown token - - .bundle_align_mode - imull $17, %ebx, %ebp - diff --git a/llvm/test/MC/X86/AlignedBundling/asm-printing-bundle-directives.s b/llvm/test/MC/X86/AlignedBundling/asm-printing-bundle-directives.s deleted file mode 100644 index 387e0fe59bf29..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/asm-printing-bundle-directives.s +++ /dev/null @@ -1,22 +0,0 @@ -# RUN: llvm-mc -filetype=asm -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s - -# Just a simple test for the assembly emitter - making sure it emits back the -# bundling directives. - - .text -foo: - .bundle_align_mode 4 -# CHECK: .bundle_align_mode 4 - pushq %rbp - .bundle_lock -# CHECK: .bundle_lock - cmpl %r14d, %ebp - jle .L_ELSE - .bundle_unlock -# CHECK: .bundle_unlock - .bundle_lock align_to_end -# CHECK: .bundle_lock align_to_end - add %rbx, %rdx - .bundle_unlock - - diff --git a/llvm/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s b/llvm/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s deleted file mode 100644 index 6dabafc394e99..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s +++ /dev/null @@ -1,2899 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -o - \ -# RUN: | llvm-objdump --triple=i386 -d --no-show-raw-insn - | FileCheck %s - -# !!! This test is auto-generated from utils/testgen/mc-bundling-x86-gen.py !!! -# It tests that bundle-aligned grouping works correctly in MC. Read the -# source of the script for more details. 
- - .text - .bundle_align_mode 4 - - .align 32, 0x90 -INSTRLEN_1_OFFSET_0: - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 0: nop -# CHECK: f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 21: nop -# CHECK: 2f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 42: nop -# CHECK: 4f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 63: nop -# CHECK: 6f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 84: nop -# CHECK: 8f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: a5: nop -# CHECK: af: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: c6: nop -# CHECK: cf: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: e7: nop -# CHECK: ef: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 108: nop -# CHECK: 10f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 129: nop -# CHECK: 12f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 14a: nop -# CHECK: 14f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 16b: nop -# CHECK: 16f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 18c: nop -# CHECK: 18f: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ad: nop -# CHECK: 1af: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ce: nop -# CHECK: 1cf: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ef: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_0: - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 200: nop -# CHECK: 20e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 221: nop -# CHECK: 22e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 242: nop -# CHECK: 24e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 263: nop -# CHECK: 26e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc 
%eax - .endr - .bundle_unlock -# CHECK: 284: nop -# CHECK: 28e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 2a5: nop -# CHECK: 2ae: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 2c6: nop -# CHECK: 2ce: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 2e7: nop -# CHECK: 2ee: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 308: nop -# CHECK: 30e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 329: nop -# CHECK: 32e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 34a: nop -# CHECK: 34e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 36b: nop -# CHECK: 36e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 38c: nop -# CHECK: 38e: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 3ad: nop -# CHECK: 3ae: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 3ce: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 3ef: nop -# CHECK: 3f0: nop -# CHECK: 3fe: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_0: - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 400: nop -# CHECK: 40d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 421: nop -# CHECK: 42d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 442: nop -# CHECK: 44d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 463: nop -# CHECK: 46d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 484: nop -# CHECK: 48d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 4a5: nop -# CHECK: 4ad: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 4c6: nop -# CHECK: 4cd: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 4e7: nop -# CHECK: 4ed: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 508: nop -# CHECK: 50d: incl - - .align 
32, 0x90 -INSTRLEN_3_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 529: nop -# CHECK: 52d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 54a: nop -# CHECK: 54d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 56b: nop -# CHECK: 56d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 58c: nop -# CHECK: 58d: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 5ad: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 5ce: nop -# CHECK: 5d0: nop -# CHECK: 5dd: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 5ef: nop -# CHECK: 5f0: nop -# CHECK: 5fd: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_0: - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 600: nop -# CHECK: 60c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 621: nop -# CHECK: 62c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 642: nop -# CHECK: 64c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 663: nop -# CHECK: 66c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 684: nop -# CHECK: 68c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 6a5: nop -# CHECK: 6ac: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 6c6: nop -# CHECK: 6cc: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 6e7: nop -# CHECK: 6ec: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 708: nop -# CHECK: 70c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 729: nop -# CHECK: 72c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 74a: nop -# CHECK: 74c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 76b: nop -# CHECK: 76c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 78c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock 
align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 7ad: nop -# CHECK: 7b0: nop -# CHECK: 7bc: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 7ce: nop -# CHECK: 7d0: nop -# CHECK: 7dc: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 7ef: nop -# CHECK: 7f0: nop -# CHECK: 7fc: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_0: - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 800: nop -# CHECK: 80b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 821: nop -# CHECK: 82b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 842: nop -# CHECK: 84b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 863: nop -# CHECK: 86b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 884: nop -# CHECK: 88b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 8a5: nop -# CHECK: 8ab: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 8c6: nop -# CHECK: 8cb: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 8e7: nop -# CHECK: 8eb: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 908: nop -# CHECK: 90b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 929: nop -# CHECK: 92b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 94a: nop -# CHECK: 94b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 96b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 98c: nop -# CHECK: 990: nop -# CHECK: 99b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 9ad: nop -# CHECK: 9b0: nop -# CHECK: 9bb: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 9ce: nop -# CHECK: 9d0: nop -# CHECK: 9db: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 9ef: nop -# CHECK: 9f0: nop -# CHECK: 9fb: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_0: - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a00: nop -# CHECK: a0a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_1: - .fill 1, 1, 
0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a21: nop -# CHECK: a2a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a42: nop -# CHECK: a4a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a63: nop -# CHECK: a6a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a84: nop -# CHECK: a8a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: aa5: nop -# CHECK: aaa: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: ac6: nop -# CHECK: aca: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: ae7: nop -# CHECK: aea: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b08: nop -# CHECK: b0a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b29: nop -# CHECK: b2a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b4a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b6b: nop -# CHECK: b70: nop -# CHECK: b7a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b8c: nop -# CHECK: b90: nop -# CHECK: b9a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: bad: nop -# CHECK: bb0: nop -# CHECK: bba: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: bce: nop -# CHECK: bd0: nop -# CHECK: bda: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: bef: nop -# CHECK: bf0: nop -# CHECK: bfa: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_0: - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c00: nop -# CHECK: c09: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c21: nop -# CHECK: c29: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c42: nop -# CHECK: c49: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c63: nop -# CHECK: c69: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c84: nop -# CHECK: c89: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_5: - .fill 5, 1, 0x90 
- .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: ca5: nop -# CHECK: ca9: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: cc6: nop -# CHECK: cc9: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: ce7: nop -# CHECK: ce9: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d08: nop -# CHECK: d09: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d29: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d4a: nop -# CHECK: d50: nop -# CHECK: d59: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d6b: nop -# CHECK: d70: nop -# CHECK: d79: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d8c: nop -# CHECK: d90: nop -# CHECK: d99: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: dad: nop -# CHECK: db0: nop -# CHECK: db9: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: dce: nop -# CHECK: dd0: nop -# CHECK: dd9: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: def: nop -# CHECK: df0: nop -# CHECK: df9: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_0: - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e00: nop -# CHECK: e08: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e21: nop -# CHECK: e28: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e42: nop -# CHECK: e48: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e63: nop -# CHECK: e68: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e84: nop -# CHECK: e88: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: ea5: nop -# CHECK: ea8: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: ec6: nop -# CHECK: ec8: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: ee7: nop -# CHECK: ee8: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f08: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_9: - .fill 9, 1, 0x90 - 
.bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f29: nop -# CHECK: f30: nop -# CHECK: f38: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f4a: nop -# CHECK: f50: nop -# CHECK: f58: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f6b: nop -# CHECK: f70: nop -# CHECK: f78: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f8c: nop -# CHECK: f90: nop -# CHECK: f98: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: fad: nop -# CHECK: fb0: nop -# CHECK: fb8: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: fce: nop -# CHECK: fd0: nop -# CHECK: fd8: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: fef: nop -# CHECK: ff0: nop -# CHECK: ff8: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_0: - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1000: nop -# CHECK: 1007: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1021: nop -# CHECK: 1027: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1042: nop -# CHECK: 1047: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1063: nop -# CHECK: 1067: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1084: nop -# CHECK: 1087: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 10a5: nop -# CHECK: 10a7: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 10c6: nop -# CHECK: 10c7: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 10e7: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1108: nop -# CHECK: 1110: nop -# CHECK: 1117: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1129: nop -# CHECK: 1130: nop -# CHECK: 1137: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 114a: nop -# CHECK: 1150: nop -# CHECK: 1157: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 116b: nop -# CHECK: 1170: nop -# CHECK: 1177: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - 
.rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 118c: nop -# CHECK: 1190: nop -# CHECK: 1197: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 11ad: nop -# CHECK: 11b0: nop -# CHECK: 11b7: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 11ce: nop -# CHECK: 11d0: nop -# CHECK: 11d7: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 11ef: nop -# CHECK: 11f0: nop -# CHECK: 11f7: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_0: - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1200: nop -# CHECK: 1206: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1221: nop -# CHECK: 1226: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1242: nop -# CHECK: 1246: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1263: nop -# CHECK: 1266: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1284: nop -# CHECK: 1286: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 12a5: nop -# CHECK: 12a6: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 12c6: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 12e7: nop -# CHECK: 12f0: nop -# CHECK: 12f6: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1308: nop -# CHECK: 1310: nop -# CHECK: 1316: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1329: nop -# CHECK: 1330: nop -# CHECK: 1336: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 134a: nop -# CHECK: 1350: nop -# CHECK: 1356: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 136b: nop -# CHECK: 1370: nop -# CHECK: 1376: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 138c: nop -# CHECK: 1390: nop -# CHECK: 1396: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 13ad: nop -# CHECK: 13b0: nop -# CHECK: 13b6: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 13ce: nop -# CHECK: 13d0: nop -# CHECK: 13d6: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_15: - .fill 
15, 1, 0x90 - .bundle_lock align_to_end - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 13ef: nop -# CHECK: 13f0: nop -# CHECK: 13f6: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_0: - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1400: nop -# CHECK: 1405: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1421: nop -# CHECK: 1425: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1442: nop -# CHECK: 1445: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1463: nop -# CHECK: 1465: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1484: nop -# CHECK: 1485: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 14a5: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 14c6: nop -# CHECK: 14d0: nop -# CHECK: 14d5: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 14e7: nop -# CHECK: 14f0: nop -# CHECK: 14f5: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1508: nop -# CHECK: 1510: nop -# CHECK: 1515: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1529: nop -# CHECK: 1530: nop -# CHECK: 1535: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 154a: nop -# CHECK: 1550: nop -# CHECK: 1555: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 156b: nop -# CHECK: 1570: nop -# CHECK: 1575: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 158c: nop -# CHECK: 1590: nop -# CHECK: 1595: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 15ad: nop -# CHECK: 15b0: nop -# CHECK: 15b5: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 15ce: nop -# CHECK: 15d0: nop -# CHECK: 15d5: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 15ef: nop -# CHECK: 15f0: nop -# CHECK: 15f5: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_0: - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1600: nop -# CHECK: 1604: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1621: nop -# CHECK: 1624: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_2: - 
.fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1642: nop -# CHECK: 1644: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1663: nop -# CHECK: 1664: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1684: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 16a5: nop -# CHECK: 16b0: nop -# CHECK: 16b4: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 16c6: nop -# CHECK: 16d0: nop -# CHECK: 16d4: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 16e7: nop -# CHECK: 16f0: nop -# CHECK: 16f4: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1708: nop -# CHECK: 1710: nop -# CHECK: 1714: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1729: nop -# CHECK: 1730: nop -# CHECK: 1734: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 174a: nop -# CHECK: 1750: nop -# CHECK: 1754: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 176b: nop -# CHECK: 1770: nop -# CHECK: 1774: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 178c: nop -# CHECK: 1790: nop -# CHECK: 1794: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 17ad: nop -# CHECK: 17b0: nop -# CHECK: 17b4: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 17ce: nop -# CHECK: 17d0: nop -# CHECK: 17d4: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 17ef: nop -# CHECK: 17f0: nop -# CHECK: 17f4: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_0: - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1800: nop -# CHECK: 1803: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1821: nop -# CHECK: 1823: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1842: nop -# CHECK: 1843: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1863: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1884: nop -# CHECK: 1890: nop -# CHECK: 1893: incl - - .align 32, 0x90 
-INSTRLEN_13_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 18a5: nop -# CHECK: 18b0: nop -# CHECK: 18b3: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 18c6: nop -# CHECK: 18d0: nop -# CHECK: 18d3: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 18e7: nop -# CHECK: 18f0: nop -# CHECK: 18f3: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1908: nop -# CHECK: 1910: nop -# CHECK: 1913: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1929: nop -# CHECK: 1930: nop -# CHECK: 1933: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 194a: nop -# CHECK: 1950: nop -# CHECK: 1953: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 196b: nop -# CHECK: 1970: nop -# CHECK: 1973: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 198c: nop -# CHECK: 1990: nop -# CHECK: 1993: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 19ad: nop -# CHECK: 19b0: nop -# CHECK: 19b3: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 19ce: nop -# CHECK: 19d0: nop -# CHECK: 19d3: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 19ef: nop -# CHECK: 19f0: nop -# CHECK: 19f3: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_0: - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a00: nop -# CHECK: 1a02: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a21: nop -# CHECK: 1a22: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a42: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a63: nop -# CHECK: 1a70: nop -# CHECK: 1a72: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a84: nop -# CHECK: 1a90: nop -# CHECK: 1a92: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1aa5: nop -# CHECK: 1ab0: nop -# CHECK: 1ab2: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ac6: nop -# CHECK: 1ad0: nop -# CHECK: 1ad2: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 
14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ae7: nop -# CHECK: 1af0: nop -# CHECK: 1af2: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b08: nop -# CHECK: 1b10: nop -# CHECK: 1b12: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b29: nop -# CHECK: 1b30: nop -# CHECK: 1b32: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b4a: nop -# CHECK: 1b50: nop -# CHECK: 1b52: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b6b: nop -# CHECK: 1b70: nop -# CHECK: 1b72: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b8c: nop -# CHECK: 1b90: nop -# CHECK: 1b92: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1bad: nop -# CHECK: 1bb0: nop -# CHECK: 1bb2: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1bce: nop -# CHECK: 1bd0: nop -# CHECK: 1bd2: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1bef: nop -# CHECK: 1bf0: nop -# CHECK: 1bf2: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_0: - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c00: nop -# CHECK: 1c01: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c21: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c42: nop -# CHECK: 1c50: nop -# CHECK: 1c51: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c63: nop -# CHECK: 1c70: nop -# CHECK: 1c71: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c84: nop -# CHECK: 1c90: nop -# CHECK: 1c91: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ca5: nop -# CHECK: 1cb0: nop -# CHECK: 1cb1: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1cc6: nop -# CHECK: 1cd0: nop -# CHECK: 1cd1: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ce7: nop -# CHECK: 1cf0: nop -# CHECK: 1cf1: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d08: nop -# CHECK: 1d10: nop -# CHECK: 1d11: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d29: 
nop -# CHECK: 1d30: nop -# CHECK: 1d31: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d4a: nop -# CHECK: 1d50: nop -# CHECK: 1d51: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d6b: nop -# CHECK: 1d70: nop -# CHECK: 1d71: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d8c: nop -# CHECK: 1d90: nop -# CHECK: 1d91: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1dad: nop -# CHECK: 1db0: nop -# CHECK: 1db1: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1dce: nop -# CHECK: 1dd0: nop -# CHECK: 1dd1: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1def: nop -# CHECK: 1df0: nop -# CHECK: 1df1: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_0: - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e00: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e21: nop -# CHECK: 1e30: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e42: nop -# CHECK: 1e50: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e63: nop -# CHECK: 1e70: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e84: nop -# CHECK: 1e90: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ea5: nop -# CHECK: 1eb0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ec6: nop -# CHECK: 1ed0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ee7: nop -# CHECK: 1ef0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f08: nop -# CHECK: 1f10: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f29: nop -# CHECK: 1f30: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f4a: nop -# CHECK: 1f50: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f6b: nop -# CHECK: 1f70: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f8c: nop -# CHECK: 1f90: incl - - .align 32, 0x90 
-INSTRLEN_16_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1fad: nop -# CHECK: 1fb0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1fce: nop -# CHECK: 1fd0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock align_to_end - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1fef: nop -# CHECK: 1ff0: incl - diff --git a/llvm/test/MC/X86/AlignedBundling/autogen-inst-offset-padding.s b/llvm/test/MC/X86/AlignedBundling/autogen-inst-offset-padding.s deleted file mode 100644 index d486ec5fca00e..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/autogen-inst-offset-padding.s +++ /dev/null @@ -1,2674 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -o - \ -# RUN: | llvm-objdump --triple=i386 -d --no-show-raw-insn - | FileCheck %s - -# !!! This test is auto-generated from utils/testgen/mc-bundling-x86-gen.py !!! -# It tests that bundle-aligned grouping works correctly in MC. Read the -# source of the script for more details. - - .text - .bundle_align_mode 4 - - .align 32, 0x90 -INSTRLEN_1_OFFSET_0: - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 0: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 21: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 42: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 63: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 84: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: a5: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: c6: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: e7: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 108: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 129: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 14a: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 16b: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 18c: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ad: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ce: incl - - .align 32, 0x90 -INSTRLEN_1_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 1 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ef: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_0: - .bundle_lock - .rept 2 - inc %eax - .endr - 
.bundle_unlock -# CHECK: 200: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 221: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 242: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 263: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 284: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 2a5: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 2c6: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 2e7: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 308: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 329: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 34a: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 36b: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 38c: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 3ad: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 3ce: incl - - .align 32, 0x90 -INSTRLEN_2_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 2 - inc %eax - .endr - .bundle_unlock -# CHECK: 3ef: nop -# CHECK: 3f0: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_0: - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 400: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 421: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 442: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 463: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 484: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 4a5: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 4c6: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 4e7: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 508: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - 
.endr - .bundle_unlock -# CHECK: 529: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 54a: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 56b: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 58c: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 5ad: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 5ce: nop -# CHECK: 5d0: incl - - .align 32, 0x90 -INSTRLEN_3_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 3 - inc %eax - .endr - .bundle_unlock -# CHECK: 5ef: nop -# CHECK: 5f0: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_0: - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 600: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 621: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 642: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 663: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 684: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 6a5: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 6c6: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 6e7: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 708: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 729: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 74a: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 76b: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 78c: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 7ad: nop -# CHECK: 7b0: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 7ce: nop -# CHECK: 7d0: incl - - .align 32, 0x90 -INSTRLEN_4_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 4 - inc %eax - .endr - .bundle_unlock -# CHECK: 7ef: nop -# CHECK: 7f0: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_0: - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 800: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 821: incl - - .align 32, 0x90 
-INSTRLEN_5_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 842: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 863: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 884: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 8a5: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 8c6: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 8e7: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 908: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 929: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 94a: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 96b: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 98c: nop -# CHECK: 990: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 9ad: nop -# CHECK: 9b0: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 9ce: nop -# CHECK: 9d0: incl - - .align 32, 0x90 -INSTRLEN_5_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 5 - inc %eax - .endr - .bundle_unlock -# CHECK: 9ef: nop -# CHECK: 9f0: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_0: - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a00: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a21: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a42: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a63: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: a84: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: aa5: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: ac6: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: ae7: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b08: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b29: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 6 - inc 
%eax - .endr - .bundle_unlock -# CHECK: b4a: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b6b: nop -# CHECK: b70: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: b8c: nop -# CHECK: b90: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: bad: nop -# CHECK: bb0: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: bce: nop -# CHECK: bd0: incl - - .align 32, 0x90 -INSTRLEN_6_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 6 - inc %eax - .endr - .bundle_unlock -# CHECK: bef: nop -# CHECK: bf0: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_0: - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c00: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c21: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c42: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c63: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: c84: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: ca5: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: cc6: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: ce7: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d08: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d29: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d4a: nop -# CHECK: d50: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d6b: nop -# CHECK: d70: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: d8c: nop -# CHECK: d90: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: dad: nop -# CHECK: db0: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: dce: nop -# CHECK: dd0: incl - - .align 32, 0x90 -INSTRLEN_7_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 7 - inc %eax - .endr - .bundle_unlock -# CHECK: def: nop -# CHECK: df0: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_0: - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e00: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e21: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_2: - 
.fill 2, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e42: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e63: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: e84: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: ea5: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: ec6: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: ee7: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f08: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f29: nop -# CHECK: f30: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f4a: nop -# CHECK: f50: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f6b: nop -# CHECK: f70: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: f8c: nop -# CHECK: f90: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: fad: nop -# CHECK: fb0: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: fce: nop -# CHECK: fd0: incl - - .align 32, 0x90 -INSTRLEN_8_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 8 - inc %eax - .endr - .bundle_unlock -# CHECK: fef: nop -# CHECK: ff0: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_0: - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1000: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1021: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1042: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1063: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1084: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 10a5: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 10c6: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 10e7: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1108: nop -# CHECK: 1110: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 1129: nop -# CHECK: 1130: incl - - .align 32, 
0x90 -INSTRLEN_9_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 114a: nop -# CHECK: 1150: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 116b: nop -# CHECK: 1170: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 118c: nop -# CHECK: 1190: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 11ad: nop -# CHECK: 11b0: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 11ce: nop -# CHECK: 11d0: incl - - .align 32, 0x90 -INSTRLEN_9_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 9 - inc %eax - .endr - .bundle_unlock -# CHECK: 11ef: nop -# CHECK: 11f0: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_0: - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1200: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1221: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1242: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1263: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1284: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 12a5: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 12c6: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 12e7: nop -# CHECK: 12f0: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1308: nop -# CHECK: 1310: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 1329: nop -# CHECK: 1330: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 134a: nop -# CHECK: 1350: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 136b: nop -# CHECK: 1370: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 138c: nop -# CHECK: 1390: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 13ad: nop -# CHECK: 13b0: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 13ce: nop -# CHECK: 13d0: incl - - .align 32, 0x90 -INSTRLEN_10_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 10 - inc %eax - .endr - .bundle_unlock -# CHECK: 13ef: nop -# CHECK: 13f0: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_0: - .bundle_lock - .rept 11 - inc %eax - 
.endr - .bundle_unlock -# CHECK: 1400: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1421: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1442: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1463: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1484: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 14a5: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 14c6: nop -# CHECK: 14d0: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 14e7: nop -# CHECK: 14f0: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1508: nop -# CHECK: 1510: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 1529: nop -# CHECK: 1530: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 154a: nop -# CHECK: 1550: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 156b: nop -# CHECK: 1570: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 158c: nop -# CHECK: 1590: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 15ad: nop -# CHECK: 15b0: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 15ce: nop -# CHECK: 15d0: incl - - .align 32, 0x90 -INSTRLEN_11_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 11 - inc %eax - .endr - .bundle_unlock -# CHECK: 15ef: nop -# CHECK: 15f0: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_0: - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1600: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1621: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1642: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1663: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1684: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 16a5: nop -# CHECK: 16b0: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 16c6: nop -# CHECK: 16d0: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 12 
- inc %eax - .endr - .bundle_unlock -# CHECK: 16e7: nop -# CHECK: 16f0: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1708: nop -# CHECK: 1710: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 1729: nop -# CHECK: 1730: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 174a: nop -# CHECK: 1750: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 176b: nop -# CHECK: 1770: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 178c: nop -# CHECK: 1790: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 17ad: nop -# CHECK: 17b0: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 17ce: nop -# CHECK: 17d0: incl - - .align 32, 0x90 -INSTRLEN_12_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 12 - inc %eax - .endr - .bundle_unlock -# CHECK: 17ef: nop -# CHECK: 17f0: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_0: - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1800: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1821: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1842: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1863: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1884: nop -# CHECK: 1890: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 18a5: nop -# CHECK: 18b0: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 18c6: nop -# CHECK: 18d0: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 18e7: nop -# CHECK: 18f0: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1908: nop -# CHECK: 1910: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 1929: nop -# CHECK: 1930: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 194a: nop -# CHECK: 1950: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 196b: nop -# CHECK: 1970: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 198c: nop -# CHECK: 1990: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 13 - 
inc %eax - .endr - .bundle_unlock -# CHECK: 19ad: nop -# CHECK: 19b0: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 19ce: nop -# CHECK: 19d0: incl - - .align 32, 0x90 -INSTRLEN_13_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 13 - inc %eax - .endr - .bundle_unlock -# CHECK: 19ef: nop -# CHECK: 19f0: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_0: - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a00: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a21: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a42: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a63: nop -# CHECK: 1a70: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1a84: nop -# CHECK: 1a90: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1aa5: nop -# CHECK: 1ab0: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ac6: nop -# CHECK: 1ad0: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ae7: nop -# CHECK: 1af0: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b08: nop -# CHECK: 1b10: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b29: nop -# CHECK: 1b30: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b4a: nop -# CHECK: 1b50: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b6b: nop -# CHECK: 1b70: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1b8c: nop -# CHECK: 1b90: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1bad: nop -# CHECK: 1bb0: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1bce: nop -# CHECK: 1bd0: incl - - .align 32, 0x90 -INSTRLEN_14_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 14 - inc %eax - .endr - .bundle_unlock -# CHECK: 1bef: nop -# CHECK: 1bf0: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_0: - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c00: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c21: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c42: nop -# CHECK: 1c50: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 
1c63: nop -# CHECK: 1c70: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1c84: nop -# CHECK: 1c90: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ca5: nop -# CHECK: 1cb0: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1cc6: nop -# CHECK: 1cd0: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ce7: nop -# CHECK: 1cf0: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d08: nop -# CHECK: 1d10: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d29: nop -# CHECK: 1d30: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d4a: nop -# CHECK: 1d50: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d6b: nop -# CHECK: 1d70: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1d8c: nop -# CHECK: 1d90: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1dad: nop -# CHECK: 1db0: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1dce: nop -# CHECK: 1dd0: incl - - .align 32, 0x90 -INSTRLEN_15_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 15 - inc %eax - .endr - .bundle_unlock -# CHECK: 1def: nop -# CHECK: 1df0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_0: - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e00: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_1: - .fill 1, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e21: nop -# CHECK: 1e30: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_2: - .fill 2, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e42: nop -# CHECK: 1e50: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_3: - .fill 3, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e63: nop -# CHECK: 1e70: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_4: - .fill 4, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1e84: nop -# CHECK: 1e90: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_5: - .fill 5, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ea5: nop -# CHECK: 1eb0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_6: - .fill 6, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ec6: nop -# CHECK: 1ed0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_7: - .fill 7, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1ee7: nop -# CHECK: 1ef0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_8: - .fill 8, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f08: nop -# CHECK: 1f10: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_9: - .fill 9, 1, 0x90 - .bundle_lock - .rept 
16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f29: nop -# CHECK: 1f30: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_10: - .fill 10, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f4a: nop -# CHECK: 1f50: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_11: - .fill 11, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f6b: nop -# CHECK: 1f70: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_12: - .fill 12, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1f8c: nop -# CHECK: 1f90: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_13: - .fill 13, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1fad: nop -# CHECK: 1fb0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_14: - .fill 14, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1fce: nop -# CHECK: 1fd0: incl - - .align 32, 0x90 -INSTRLEN_16_OFFSET_15: - .fill 15, 1, 0x90 - .bundle_lock - .rept 16 - inc %eax - .endr - .bundle_unlock -# CHECK: 1fef: nop -# CHECK: 1ff0: incl - diff --git a/llvm/test/MC/X86/AlignedBundling/bundle-group-too-large-error.s b/llvm/test/MC/X86/AlignedBundling/bundle-group-too-large-error.s deleted file mode 100644 index 697b8bf6ab6c0..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/bundle-group-too-large-error.s +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s -# RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - 2>&1 | FileCheck %s - -# CHECK: ERROR: Fragment can't be larger than a bundle size - - .text -foo: - .bundle_align_mode 4 - pushq %rbp - - .bundle_lock - pushq %r14 - callq bar - callq bar - callq bar - callq bar - .bundle_unlock - diff --git a/llvm/test/MC/X86/AlignedBundling/bundle-lock-option-error.s b/llvm/test/MC/X86/AlignedBundling/bundle-lock-option-error.s deleted file mode 100644 index b849d2b333007..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/bundle-lock-option-error.s +++ /dev/null @@ -1,11 +0,0 @@ -# RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s - -# Missing .bundle_align_mode argument -# CHECK: error: invalid option - - .bundle_align_mode 4 - .bundle_lock 5 - imull $17, %ebx, %ebp - .bundle_unlock - - diff --git a/llvm/test/MC/X86/AlignedBundling/bundle-subtarget-change-error.s b/llvm/test/MC/X86/AlignedBundling/bundle-subtarget-change-error.s deleted file mode 100644 index c02d0d6b19b4e..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/bundle-subtarget-change-error.s +++ /dev/null @@ -1,16 +0,0 @@ -# RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - 2>&1 | FileCheck %s -# RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - 2>&1 | FileCheck %s - -# Switching mode will change subtarget, which we can't do within a bundle - .text - .code64 - .bundle_align_mode 4 -foo: - pushq %rbp - .bundle_lock - addl %ebp, %eax - .code32 - movb $0x0, (%si) - .bundle_unlock - -CHECK: LLVM ERROR: A Bundle can only have one Subtarget. 
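The INSTRLEN_n_OFFSET_m matrix deleted above encodes a single padding rule: under .bundle_align_mode 4 a bundle-locked group must not cross a 16-byte boundary, a group locked with align_to_end must also finish exactly on one, and a group longer than a bundle is rejected outright ("Fragment can't be larger than a bundle size"). A minimal Python sketch of that rule (an illustration only, not the utils/testgen/mc-bundling-x86-gen.py generator the tests reference), checked against two of the CHECK offsets above:

BUNDLE = 16  # 2**4, matching .bundle_align_mode 4

def pad_before_group(offset, length, align_to_end=False):
    """NOP bytes the assembler inserts before a bundle-locked group of
    `length` bytes that would otherwise start at `offset`."""
    if length > BUNDLE:
        # Mirrors "ERROR: Fragment can't be larger than a bundle size".
        raise ValueError("bundle-locked group larger than a bundle")
    if align_to_end:
        # align_to_end: the group must end exactly on a bundle boundary.
        return -(offset + length) % BUNDLE
    # Plain .bundle_lock: pad to the next boundary only if the group
    # would otherwise cross one.
    if offset % BUNDLE + length > BUNDLE:
        return -offset % BUNDLE
    return 0

# INSTRLEN_2_OFFSET_15: a 2-byte group at 0x3ef would cross 0x3f0, so one
# NOP is inserted ("# CHECK: 3ef: nop" / "# CHECK: 3f0: incl").
assert pad_before_group(0x3EF, 2) == 1
# INSTRLEN_15_OFFSET_10 with align_to_end: 7 NOPs move the 15-byte group
# to 0x1d51 so it ends at 0x1d60 ("# CHECK: 1d4a: nop" ... "1d51: incl").
assert pad_before_group(0x1D4A, 15, align_to_end=True) == 7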
diff --git a/llvm/test/MC/X86/AlignedBundling/different-sections.s b/llvm/test/MC/X86/AlignedBundling/different-sections.s
deleted file mode 100644
index bc4f20132dbe7..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/different-sections.s
+++ /dev/null
@@ -1,27 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-
-# Test two different executable sections with bundling.
-
-  .bundle_align_mode 3
-  .section text1, "x"
-# CHECK: section text1
-  imull $17, %ebx, %ebp
-  imull $17, %ebx, %ebp
-
-  imull $17, %ebx, %ebp
-# CHECK: 6: nop
-# CHECK-NEXT: 8: imull
-
-  .section text2, "x"
-# CHECK: section text2
-  imull $17, %ebx, %ebp
-  imull $17, %ebx, %ebp
-
-  imull $17, %ebx, %ebp
-# CHECK: 6: nop
-# CHECK-NEXT: 8: imull
-
-
diff --git a/llvm/test/MC/X86/AlignedBundling/labeloffset.s b/llvm/test/MC/X86/AlignedBundling/labeloffset.s
deleted file mode 100644
index 2850aba718430..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/labeloffset.s
+++ /dev/null
@@ -1,85 +0,0 @@
-# RUN: llvm-mc -triple=i686-linux -filetype=obj %s -o - | \
-# RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn -r - | FileCheck %s
-# RUN: llvm-mc -triple=i686-nacl -filetype=obj %s -o - | \
-# RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn -r - | FileCheck %s
-# RUN: llvm-mc -triple=i686-nacl -filetype=obj -mc-relax-all %s -o - | \
-# RUN:   llvm-objdump --no-print-imm-hex -d --no-show-raw-insn -r - | FileCheck %s
-
-  .bundle_align_mode 5
-  .text
-  .globl main
-  .align 32, 0x90
-  .type main,@function
-main:                                   # @main
-# CHECK-LABEL: <main>:
-# Call + pop sequence for determining the PIC base.
-  .bundle_lock align_to_end
-  calll .L0$pb
-  .bundle_unlock
-.L0$pb:
-  popl %eax
-# CHECK: 20: popl
-# 26 bytes of instructions between the pop and the use of the pic base symbol.
-  movl $3, 2(%ebx, %ebx)
-  movl $3, 2(%ebx, %ebx)
-  movl $3, 2(%ebx, %ebx)
-  hlt
-  hlt
-# CHECK: nop
-.Ltmp0:
-  addl (.Ltmp0-.L0$pb), %eax
-# The addl has bundle padding to push it from 0x3b to 0x40.
-# The difference between the labels should be 0x20 (0x40-0x20) not 0x1b
-# (0x3b-0x20)
-# CHECK: 40: addl 32, %eax
-  popl %ecx
-  jmp *%ecx
-
-
-# Also make sure it works with a non-relaxable instruction (cmp vs add)
-# and for 2 adjacent labels that both point to the correct instruction
-  .section .text.bar, "ax"
-  .globl bar
-  .align 32, 0x90
-  .type bar,@function
-bar:
-# CHECK-LABEL: bar:
-  .bundle_lock align_to_end
-  calll .L1$pb
-  .bundle_unlock
-.L1$pb:
-  popl %eax
-# CHECK: 20: popl
-# 26 bytes of instructions between the pop and the use of the pic base symbol.
-  movl $3, 2(%ebx, %ebx)
-  movl $3, 2(%ebx, %ebx)
-  movl $3, 2(%ebx, %ebx)
-  hlt
-  hlt
-# CHECK: nop
-.Ltmp1:
-.Ltmp2:
-  cmpl %eax, .Ltmp1
-# CHECK: 40: cmpl %eax, 64
-  cmpl %eax, (.Ltmp2-.L1$pb)
-# CHECK: 46: cmpl %eax, 32
-  popl %ecx
-  jmp *%ecx
-
-
-# Switch sections in the middle of a function
-  .section .text.foo, "ax"
-  .globl foo
-  .align 32, 0x90
-  .type foo,@function
-# CHECK-LABEL: foo:
-foo:
-  inc %eax
-tmp3:
-  .rodata
-  .type obj,@object
-  .comm obj,4,4
-  .section .text.foo
-  inc %eax
-# CHECK: <tmp3>:
-# CHECK-NEXT: 1: incl
diff --git a/llvm/test/MC/X86/AlignedBundling/lit.local.cfg b/llvm/test/MC/X86/AlignedBundling/lit.local.cfg
deleted file mode 100644
index 42bf50dcc13c3..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/lit.local.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-if not "X86" in config.root.targets:
-    config.unsupported = True
diff --git a/llvm/test/MC/X86/AlignedBundling/lock-without-bundle-mode-error.s b/llvm/test/MC/X86/AlignedBundling/lock-without-bundle-mode-error.s
deleted file mode 100644
index 65ee2d5b9f385..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/lock-without-bundle-mode-error.s
+++ /dev/null
@@ -1,10 +0,0 @@
-# RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
-
-# .bundle_lock can't come without a .bundle_align_mode before it
-
-# CHECK: ERROR: .bundle_lock forbidden when bundling is disabled
-
-  imull $17, %ebx, %ebp
-  .bundle_lock
-
-
diff --git a/llvm/test/MC/X86/AlignedBundling/long-nop-pad.s b/llvm/test/MC/X86/AlignedBundling/long-nop-pad.s
deleted file mode 100644
index 439915dfd0b5b..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/long-nop-pad.s
+++ /dev/null
@@ -1,31 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-
-# Test that long nops are generated for padding where possible.
-
-  .text
-foo:
-  .bundle_align_mode 5
-
-# This callq instruction is 5 bytes long
-  .bundle_lock align_to_end
-  callq bar
-  .bundle_unlock
-# To align this group to a bundle end, we need two 10-byte NOPs and a 7-byte NOP.
-# CHECK: 0: nop
-# CHECK-NEXT: a: nop
-# CHECK-NEXT: 14: nop
-# CHECK: 1b: callq
-
-# This push instruction is 1 byte long
-  .bundle_lock align_to_end
-  push %rax
-  .bundle_unlock
-# To align this group to a bundle end, we need three 10-byte NOPs and a 1-byte NOP.
-# CHECK: 20: nop
-# CHECK-NEXT: 2a: nop
-# CHECK-NEXT: 34: nop
-# CHECK-NEXT: 3e: nop
-# CHECK-NEXT: 3f: pushq
diff --git a/llvm/test/MC/X86/AlignedBundling/misaligned-bundle-group.s b/llvm/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
deleted file mode 100644
index 92bd9ec016bd5..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/misaligned-bundle-group.s
+++ /dev/null
@@ -1,19 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - \
-# RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - \
-# RUN:   | FileCheck --check-prefixes=CHECK,CHECK-OPT %s
-
-  .text
-foo:
-  .bundle_align_mode 5
-  push %ebp # 1 byte
-  .align 16
-  .bundle_lock align_to_end
-# CHECK: 1: nopw %cs:(%eax,%eax)
-# CHECK: 10: nopw %cs:(%eax,%eax)
-# CHECK-OPT: 1b: calll 0x1c
-  calll bar # 5 bytes
-  .bundle_unlock
-  ret # 1 byte
diff --git a/llvm/test/MC/X86/AlignedBundling/misaligned-bundle.s b/llvm/test/MC/X86/AlignedBundling/misaligned-bundle.s
deleted file mode 100644
index 0bf5cfd802be9..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/misaligned-bundle.s
+++ /dev/null
@@ -1,26 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro %s -o - \
-# RUN:   | llvm-objdump --no-print-imm-hex -d --no-show-raw-insn - \
-# RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s
-# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
-# RUN:   | llvm-objdump --no-print-imm-hex -d --no-show-raw-insn - \
-# RUN:   | FileCheck --check-prefixes=CHECK,CHECK-OPT %s
-
-  .text
-foo:
-  .bundle_align_mode 5
-  push %ebp # 1 byte
-  .align 16
-# CHECK: 1: nopw %cs:(%eax,%eax)
-# CHECK-OPT: 10: movl $1, (%esp)
-  movl $0x1, (%esp) # 7 bytes
-  movl $0x1, (%esp) # 7 bytes
-# CHECK-OPT: 1e: nop
-  movl $0x2, 0x1(%esp) # 8 bytes
-  movl $0x2, 0x1(%esp) # 8 bytes
-  movl $0x2, 0x1(%esp) # 8 bytes
-  movl $0x2, (%esp) # 7 bytes
-# CHECK-OPT: 3f: nop
-# CHECK-OPT: 40: movl $3, (%esp)
-  movl $0x3, (%esp) # 7 bytes
-  movl $0x3, (%esp) # 7 bytes
-  ret
diff --git a/llvm/test/MC/X86/AlignedBundling/nesting.s b/llvm/test/MC/X86/AlignedBundling/nesting.s
deleted file mode 100644
index e107a413521b3..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/nesting.s
+++ /dev/null
@@ -1,73 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-
-# Will be bundle-aligning to 16 byte boundaries
-  .bundle_align_mode 4
-  .text
-# CHECK-LABEL: <foo>:
-.type foo,@function
-foo:
-# Test that bundle alignment mode can be set more than once.
-  .bundle_align_mode 4
-# Each of these callq instructions is 5 bytes long
-  callq bar
-  callq bar
-  .bundle_lock
-  .bundle_lock
-  callq bar
-  callq bar
-  .bundle_unlock
-  .bundle_unlock
-# CHECK: 10: callq {{.*}}
-# CHECK-NEXT: 15: callq {{.*}}
-
-  .p2align 4
-# CHECK-LABEL: <bar>:
-.type bar,@function
-bar:
-  callq foo
-  callq foo
-# Check that the callqs get bundled together, and that the whole group is
-# align_to_end
-  .bundle_lock
-  callq bar
-  .bundle_lock align_to_end
-  callq bar
-  .bundle_unlock
-  .bundle_unlock
-# CHECK: 36: callq {{.*}}
-# CHECK-NEXT: 3b: callq {{.*}}
-
-# CHECK-LABEL: <baz>:
-.type baz,@function
-baz:
-  callq foo
-  callq foo
-# Check that the callqs get bundled together, and that the whole group is
-# align_to_end (with the outer directive marked align_to_end)
-  .bundle_lock align_to_end
-  callq bar
-  .bundle_lock
-  callq bar
-  .bundle_unlock
-  .bundle_unlock
-# CHECK: 56: callq {{.*}}
-# CHECK-NEXT: 5b: callq {{.*}}
-
-# CHECK-LABEL: quux
-.type quux,@function
-quux:
-  callq bar
-  callq bar
-  .bundle_lock
-  .bundle_lock
-  callq bar
-  .bundle_unlock
-  callq bar
-  .bundle_unlock
-# Check that the calls are bundled together when the second one is after the
-# inner nest is closed.
-# CHECK: 70: callq {{.*}}
-# CHECK-NEXT: 75: callq {{.*}}
diff --git a/llvm/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s b/llvm/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
deleted file mode 100644
index d725724f8fd44..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s
+++ /dev/null
@@ -1,35 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-
-# Test some variations of padding to the end of a bundle.
-
-  .text
-foo:
-  .bundle_align_mode 4
-
-# Each of these callq instructions is 5 bytes long
-  callq bar
-  callq bar
-  .bundle_lock align_to_end
-  callq bar
-  .bundle_unlock
-# To align this group to a bundle end, we need a 1-byte NOP.
-# CHECK: a: nop
-# CHECK-NEXT: b: callq
-
-  callq bar
-  callq bar
-  .bundle_lock align_to_end
-  callq bar
-  callq bar
-  .bundle_unlock
-# Here we have to pad until the end of the *next* boundary because
-# otherwise the group crosses a boundary.
-# CHECK: 1a: nop
-# The nop sequence may be implemented as one instruction or many, but if
-# it's one instruction, that instruction cannot itself cross the boundary.
-# CHECK: 20: nop
-# CHECK-NEXT: 26: callq
-# CHECK-NEXT: 2b: callq
diff --git a/llvm/test/MC/X86/AlignedBundling/pad-bundle-groups.s b/llvm/test/MC/X86/AlignedBundling/pad-bundle-groups.s
deleted file mode 100644
index 9e07b5994864b..0000000000000
--- a/llvm/test/MC/X86/AlignedBundling/pad-bundle-groups.s
+++ /dev/null
@@ -1,49 +0,0 @@
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \
-# RUN:   | llvm-objdump -d --no-show-raw-insn - | FileCheck %s
-
-# Test some variations of padding for bundle-locked groups.
- - .text -foo: - .bundle_align_mode 4 - -# Each of these callq instructions is 5 bytes long - callq bar - callq bar - - .bundle_lock - callq bar - callq bar - .bundle_unlock -# We'll need a 6-byte NOP before this group -# CHECK: a: nop -# CHECK-NEXT: 10: callq -# CHECK-NEXT: 15: callq - - .bundle_lock - callq bar - callq bar - .bundle_unlock -# Same here -# CHECK: 1a: nop -# CHECK-NEXT: 20: callq -# CHECK-NEXT: 25: callq - - .align 16, 0x90 - callq bar - .bundle_lock - callq bar - callq bar - callq bar - .bundle_unlock -# And here we'll need a 10-byte NOP + 1-byte NOP -# CHECK: 30: callq -# CHECK: 35: nop -# CHECK: 3f: nop -# CHECK-NEXT: 40: callq -# CHECK-NEXT: 45: callq - - - diff --git a/llvm/test/MC/X86/AlignedBundling/relax-at-bundle-end.s b/llvm/test/MC/X86/AlignedBundling/relax-at-bundle-end.s deleted file mode 100644 index 2c510f796e19f..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/relax-at-bundle-end.s +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ -# RUN: | llvm-objdump -d --no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \ -# RUN: | llvm-objdump -d --no-show-raw-insn - | FileCheck %s - -# Test that an instruction near a bundle end gets properly padded -# after it is relaxed. -.text -foo: - .bundle_align_mode 5 - .rept 29 - push %rax - .endr -# CHECK: 1c: push -# CHECK: 1d: nop -# CHECK: 20: jne - jne 0x100 - diff --git a/llvm/test/MC/X86/AlignedBundling/relax-in-bundle-group.s b/llvm/test/MC/X86/AlignedBundling/relax-in-bundle-group.s deleted file mode 100644 index 2419a02f9ab74..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/relax-in-bundle-group.s +++ /dev/null @@ -1,44 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \ -# RUN: | llvm-objdump -d - | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ -# RUN: | llvm-objdump -d - | FileCheck %s - -# Test that instructions inside bundle-locked groups are relaxed even if their -# fixup is short enough not to warrant relaxation on its own. - - .text -foo: - .bundle_align_mode 4 - pushq %rbp - - movl %edi, %ebx - callq bar - movl %eax, %r14d - imull $17, %ebx, %ebp - movl %ebx, %edi - callq bar - cmpl %r14d, %ebp - .bundle_lock - - jle .L_ELSE -# This group would've started at 0x18 and is too long, so a chunky NOP padding -# is inserted to push it to 0x20. 
-# CHECK: 18: {{[a-f0-9 ]+}} nopl - -# The long encoding for JLE should be used here even though its target is close -# CHECK-NEXT: 20: 0f 8e - - addl %ebp, %eax - - jmp .L_RET -# Same for the JMP -# CHECK: 28: e9 - - .bundle_unlock - -.L_ELSE: - imull %ebx, %eax -.L_RET: - - popq %rbx - diff --git a/llvm/test/MC/X86/AlignedBundling/rodata-section.s b/llvm/test/MC/X86/AlignedBundling/rodata-section.s deleted file mode 100644 index 6c2b41a6f8034..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/rodata-section.s +++ /dev/null @@ -1,30 +0,0 @@ -# RUN: llvm-mc -triple=i686-nacl -filetype=obj %s -o - \ -# RUN: | llvm-objdump --no-print-imm-hex -d --no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -triple=i686-nacl -filetype=obj -mc-relax-all %s -o - \ -# RUN: | llvm-objdump --no-print-imm-hex -d --no-show-raw-insn - | FileCheck %s - - .bundle_align_mode 5 - .text - .align 32, 0x90 -# CHECK: 0: movl $14, 8(%esp) - movl $.str2, 8(%esp) -# CHECK: 8: movl $7, 4(%esp) - movl $.str1, 4(%esp) -# CHECK: 10: movl $0, (%esp) - movl $.str, (%esp) - - .type .str,@object - .section .rodata,"a",@progbits -.str: - .asciz "hello1" - .size .str, 7 - - .type .str1,@object -.str1: - .asciz "hello2" - .size .str1, 7 - - .type .str2,@object -.str2: - .asciz "hello3" - .size .str2, 7 diff --git a/llvm/test/MC/X86/AlignedBundling/section-alignment.s b/llvm/test/MC/X86/AlignedBundling/section-alignment.s deleted file mode 100644 index d5277d194576e..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/section-alignment.s +++ /dev/null @@ -1,23 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ -# RUN: | llvm-readobj --sections - | FileCheck %s - -# Test that bundle-aligned sections with instructions are aligned - - .bundle_align_mode 5 -# CHECK: Sections -# Check that the empty .text section has the default alignment -# CHECK-LABEL: Name: .text -# CHECK-NOT: Name -# CHECK: AddressAlignment: 4 - - .section text1, "x" - imull $17, %ebx, %ebp -# CHECK-LABEL: Name: text1 -# CHECK-NOT: Name -# CHECK: AddressAlignment: 32 - - .section text2, "x" - imull $17, %ebx, %ebp -# CHECK-LABEL: Name: text2 -# CHECK-NOT: Name -# CHECK: AddressAlignment: 32 diff --git a/llvm/test/MC/X86/AlignedBundling/single-inst-bundling.s b/llvm/test/MC/X86/AlignedBundling/single-inst-bundling.s deleted file mode 100644 index 1bcd010240ae3..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/single-inst-bundling.s +++ /dev/null @@ -1,52 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \ -# RUN: | llvm-objdump -d --no-show-raw-insn - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-OPT %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ -# RUN: | llvm-objdump -d --no-show-raw-insn - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-RELAX %s - -# Test simple NOP insertion for single instructions. - - .text -foo: - # Will be bundle-aligning to 16 byte boundaries - .bundle_align_mode 4 - pushq %rbp - pushq %r14 - pushq %rbx - - movl %edi, %ebx - callq bar - movl %eax, %r14d - - imull $17, %ebx, %ebp -# This imull is 3 bytes long and should have started at 0xe, so two bytes -# of nop padding are inserted instead and it starts at 0x10 -# CHECK: nop -# CHECK-NEXT: 10: imull - - movl %ebx, %edi - callq bar - cmpl %r14d, %ebp -# CHECK-RELAX: nopl - jle .L_ELSE -# Due to the padding that's inserted before the addl, the jump target -# becomes farther by one byte. 
-# CHECK-OPT: jle 0x24 -# CHECK-RELAX: jle 0x2d - - addl %ebp, %eax -# CHECK-OPT: nop -# CHECK-OPT-NEXT:20: addl -# CHECK-RELAX: 26: addl - - jmp .L_RET -.L_ELSE: - imull %ebx, %eax -.L_RET: - ret - -# Just verifying that data fills don't drive bundling crazy - .data - .byte 40 - .byte 98 - - diff --git a/llvm/test/MC/X86/AlignedBundling/switch-section-locked-error.s b/llvm/test/MC/X86/AlignedBundling/switch-section-locked-error.s deleted file mode 100644 index 6ea3c36beb1cc..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/switch-section-locked-error.s +++ /dev/null @@ -1,16 +0,0 @@ -# RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s - -# This test invokes .bundle_lock and then switches to a different section -# w/o the appropriate unlock. - -# CHECK: ERROR: Unterminated .bundle_lock - - .bundle_align_mode 3 - .section text1, "x" - imull $17, %ebx, %ebp - .bundle_lock - imull $17, %ebx, %ebp - - .section text2, "x" - imull $17, %ebx, %ebp - diff --git a/llvm/test/MC/X86/AlignedBundling/unlock-without-lock-error.s b/llvm/test/MC/X86/AlignedBundling/unlock-without-lock-error.s deleted file mode 100644 index 811ef95a451d4..0000000000000 --- a/llvm/test/MC/X86/AlignedBundling/unlock-without-lock-error.s +++ /dev/null @@ -1,11 +0,0 @@ -# RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s - -# .bundle_unlock can't come without a .bundle_lock before it - -# CHECK: ERROR: .bundle_unlock without matching lock - - .bundle_align_mode 3 - imull $17, %ebx, %ebp - .bundle_unlock - - diff --git a/llvm/test/MC/X86/align-branch-bundle.s b/llvm/test/MC/X86/align-branch-bundle.s deleted file mode 100644 index aba90f02a8a69..0000000000000 --- a/llvm/test/MC/X86/align-branch-bundle.s +++ /dev/null @@ -1,21 +0,0 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown --x86-align-branch-boundary=16 --x86-align-branch=fused+jcc --mc-relax-all %s | llvm-objdump --no-print-imm-hex -d --no-show-raw-insn - | FileCheck %s - -# Check using option --x86-align-branch-boundary=16 --x86-align-branch=fused+jcc --mc-relax-all with bundle won't make code crazy - -# CHECK: 0: pushq %rbp -# CHECK-NEXT: 1: testq $2, %rdx -# CHECK-NEXT: 8: jne -# CHECK-NEXT: e: nop -# CHECK-NEXT: 10: jle - - .text - .p2align 4 -foo: - push %rbp - # Will be bundle-aligning to 8 byte boundaries - .bundle_align_mode 3 - test $2, %rdx - jne foo -# This jle is 6 bytes long and should have started at 0xe, so two bytes -# of nop padding are inserted instead and it starts at 0x10 - jle foo diff --git a/llvm/test/MC/X86/reloc-directive-elf-32.s b/llvm/test/MC/X86/reloc-directive-elf-32.s index d4b612ebfcefc..d3112dd5f7daf 100644 --- a/llvm/test/MC/X86/reloc-directive-elf-32.s +++ b/llvm/test/MC/X86/reloc-directive-elf-32.s @@ -4,16 +4,7 @@ # RUN: llvm-readobj -r %t | FileCheck %s # RUN: llvm-readelf -x .data %t | FileCheck --check-prefix=HEX %s -# PRINT: .reloc 2, R_386_NONE, .data -# PRINT-NEXT: .reloc 1, R_386_NONE, foo+4 -# PRINT-NEXT: .reloc 0, R_386_NONE, 8 -# PRINT-NEXT: .reloc 0, R_386_32, .data+2 -# PRINT-NEXT: .reloc 0, R_386_IRELATIVE, foo+3 -# PRINT-NEXT: .reloc 0, R_386_GOT32X, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+2, R_386_NONE, .data # X86 relocations use the Elf32_Rel format. Addends are neither stored in the # relocation entries nor applied in the referenced locations. 
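Since the tests above lean on this property, it may help to spell it out: a REL-format entry has no addend field at all, so an addend such as foo+4 could only live in the relocated bytes, and these tests additionally verify that it is not written there either, whereas x86-64's RELA entries carry the addend explicitly. A small Python sketch of the two entry layouts, with the field packing taken from the ELF specification (the helper names are ours, not an LLVM API):

import struct

def elf32_rel(r_offset, sym, rtype):
    # Elf32_Rel: r_offset + r_info, where r_info = (sym << 8) | type.
    # 8 bytes total -- there is no r_addend field.
    return struct.pack("<II", r_offset, (sym << 8) | rtype)

def elf64_rela(r_offset, sym, rtype, addend):
    # Elf64_Rela: r_offset + r_info + r_addend, r_info = (sym << 32) | type.
    return struct.pack("<QQq", r_offset, (sym << 32) | rtype, addend)

R_386_NONE = R_X86_64_NONE = 0
assert len(elf32_rel(2, 1, R_386_NONE)) == 8          # nowhere to store "+4"
assert len(elf64_rela(2, 1, R_X86_64_NONE, 4)) == 24  # addend kept in the entry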
@@ -31,20 +22,20 @@ # HEX: 0x00000000 00000000 00000000 .text + .reloc .+2, R_386_NONE, .data + .reloc .+1, R_386_NONE, foo+4 + .reloc .+0, R_386_NONE, 8 + .reloc .+0, R_386_32, .data+2 + .reloc .+0, R_386_IRELATIVE, foo+3 + .reloc .+0, R_386_GOT32X, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_8, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 ret nop nop - .reloc 2, R_386_NONE, .data - .reloc 1, R_386_NONE, foo+4 - .reloc 0, R_386_NONE, 8 - .reloc 0, R_386_32, .data+2 - .reloc 0, R_386_IRELATIVE, foo+3 - .reloc 0, R_386_GOT32X, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_8, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 .data .globl foo diff --git a/llvm/test/MC/X86/reloc-directive-elf-64.s b/llvm/test/MC/X86/reloc-directive-elf-64.s index e0a1a5730597f..d6b8db98d5d08 100644 --- a/llvm/test/MC/X86/reloc-directive-elf-64.s +++ b/llvm/test/MC/X86/reloc-directive-elf-64.s @@ -3,18 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux-musl %s -o %t # RUN: llvm-readobj -r %t | FileCheck %s -# PRINT: .reloc 2, R_X86_64_NONE, .data -# PRINT-NEXT: .reloc 1, R_X86_64_NONE, foo+4 -# PRINT-NEXT: .reloc 0, R_X86_64_NONE, 8 -# PRINT-NEXT: .reloc 0, R_X86_64_64, .data+2 -# PRINT-NEXT: .reloc 0, R_X86_64_GOTPCRELX, foo+3 -# PRINT-NEXT: .reloc 0, R_X86_64_REX_GOTPCRELX, 5 -# PRINT-NEXT: .reloc 0, R_X86_64_CODE_4_GOTPCRELX, 7 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+2, R_X86_64_NONE, .data # CHECK: 0x2 R_X86_64_NONE .data 0x0 # CHECK-NEXT: 0x1 R_X86_64_NONE foo 0x4 @@ -30,22 +19,22 @@ # CHECK-NEXT: 0x0 R_X86_64_64 - 0x9 .text + .reloc .+2, R_X86_64_NONE, .data + .reloc .+1, R_X86_64_NONE, foo+4 + .reloc .+0, R_X86_64_NONE, 8 + .reloc .+0, R_X86_64_64, .data+2 + .reloc .+0, R_X86_64_GOTPCRELX, foo+3 + .reloc .+0, R_X86_64_REX_GOTPCRELX, 5 + .reloc .+0, R_X86_64_CODE_4_GOTPCRELX, 7 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_8, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 ret nop nop - .reloc 2, R_X86_64_NONE, .data - .reloc 1, R_X86_64_NONE, foo+4 - .reloc 0, R_X86_64_NONE, 8 - .reloc 0, R_X86_64_64, .data+2 - .reloc 0, R_X86_64_GOTPCRELX, foo+3 - .reloc 0, R_X86_64_REX_GOTPCRELX, 5 - .reloc 0, R_X86_64_CODE_4_GOTPCRELX, 7 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_8, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/X86/reloc-directive.s b/llvm/test/MC/X86/reloc-directive.s index 5f4fc2394f5e7..124dc06951122 100644 --- a/llvm/test/MC/X86/reloc-directive.s +++ b/llvm/test/MC/X86/reloc-directive.s @@ -8,16 +8,16 @@ # RUN: FileCheck -check-prefix=OBJ-64 %s .text foo: + .reloc .+4, dir32, foo # ASM: .reloc {{.*}}+4, dir32, foo + .reloc .+0, secrel32, foo+4 # ASM: .reloc {{.*}}+0, secrel32, foo+4 + .reloc .+8, secidx, foo+8 # ASM: .reloc {{.*}}+8, secidx, foo+8 + .reloc .+12, dir32, foo@secrel32 # ASM: .reloc {{.*}}+12, dir32, foo@SECREL32 + .reloc .+16, dir32, foo@imgrel # ASM: .reloc {{.*}}+16, dir32, foo@IMGREL .long 0 .long 0 .long 0 .long 0 .long 0 - .reloc 4, dir32, foo # ASM: .reloc 4, dir32, foo - .reloc 0, secrel32, foo+4 # ASM: .reloc 0, secrel32, foo+4 - .reloc 8, secidx, foo+8 # ASM: .reloc 8, secidx, foo+8 - .reloc 12, dir32, foo@secrel32 # ASM: .reloc 12, dir32, foo@SECREL32 - .reloc 16, dir32, foo@imgrel # 
ASM: .reloc 16, dir32, foo@IMGREL # OBJ-32-LABEL: Name: .text # OBJ-32: 0000: 04000000 00000000 00000000 diff --git a/llvm/test/Other/opt-bisect-new-pass-manager.ll b/llvm/test/Other/opt-bisect-new-pass-manager.ll index 01dad705ec362..8f8078d4d8409 100644 --- a/llvm/test/Other/opt-bisect-new-pass-manager.ll +++ b/llvm/test/Other/opt-bisect-new-pass-manager.ll @@ -11,84 +11,84 @@ ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=inferattrs -opt-bisect-limit=-1 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-MODULE-PASS -; CHECK-MODULE-PASS: BISECT: running pass (1) InferFunctionAttrsPass on [module] +; CHECK-MODULE-PASS: BISECT: running pass (1) inferattrs on [module] ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=inferattrs -opt-bisect-limit=0 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-MODULE-PASS -; CHECK-LIMIT-MODULE-PASS: BISECT: NOT running pass (1) InferFunctionAttrsPass on [module] +; CHECK-LIMIT-MODULE-PASS: BISECT: NOT running pass (1) inferattrs on [module] ; RUN: opt -disable-output -debug-pass-manager \ ; RUN: -passes=inferattrs -opt-bisect-limit=-1 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-REQUIRED-PASS -; CHECK-REQUIRED-PASS: BISECT: running pass (1) InferFunctionAttrsPass on [module] +; CHECK-REQUIRED-PASS: BISECT: running pass (1) inferattrs on [module] ; CHECK-REQUIRED-PASS-NOT: BISECT: {{.*}}VerifierPass ; CHECK-REQUIRED-PASS: Running pass: VerifierPass ; RUN: opt -disable-output -debug-pass-manager \ ; RUN: -passes=inferattrs -opt-bisect-limit=0 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-REQUIRED-PASS -; CHECK-LIMIT-REQUIRED-PASS: BISECT: NOT running pass (1) InferFunctionAttrsPass on [module] +; CHECK-LIMIT-REQUIRED-PASS: BISECT: NOT running pass (1) inferattrs on [module] ; CHECK-LIMIT-REQUIRED-PASS-NOT: BISECT: {{.*}}VerifierPass ; CHECK-LIMIT-REQUIRED-PASS: Running pass: VerifierPass ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=early-cse -opt-bisect-limit=-1 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-FUNCTION-PASS -; CHECK-FUNCTION-PASS: BISECT: running pass (1) EarlyCSEPass on f1 -; CHECK-FUNCTION-PASS: BISECT: running pass (2) EarlyCSEPass on f2 -; CHECK-FUNCTION-PASS: BISECT: running pass (3) EarlyCSEPass on f3 -; CHECK-FUNCTION-PASS: BISECT: running pass (4) EarlyCSEPass on f4 +; CHECK-FUNCTION-PASS: BISECT: running pass (1) early-cse on f1 +; CHECK-FUNCTION-PASS: BISECT: running pass (2) early-cse on f2 +; CHECK-FUNCTION-PASS: BISECT: running pass (3) early-cse on f3 +; CHECK-FUNCTION-PASS: BISECT: running pass (4) early-cse on f4 ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=early-cse -opt-bisect-limit=2 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-FUNCTION-PASS -; CHECK-LIMIT-FUNCTION-PASS: BISECT: running pass (1) EarlyCSEPass on f1 -; CHECK-LIMIT-FUNCTION-PASS: BISECT: running pass (2) EarlyCSEPass on f2 -; CHECK-LIMIT-FUNCTION-PASS: BISECT: NOT running pass (3) EarlyCSEPass on f3 -; CHECK-LIMIT-FUNCTION-PASS: BISECT: NOT running pass (4) EarlyCSEPass on f4 +; CHECK-LIMIT-FUNCTION-PASS: BISECT: running pass (1) early-cse on f1 +; CHECK-LIMIT-FUNCTION-PASS: BISECT: running pass (2) early-cse on f2 +; CHECK-LIMIT-FUNCTION-PASS: BISECT: NOT running pass (3) early-cse on f3 +; CHECK-LIMIT-FUNCTION-PASS: BISECT: NOT running pass (4) early-cse on f4 ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=function-attrs -opt-bisect-limit=-1 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-CGSCC-PASS -; CHECK-CGSCC-PASS: BISECT: running 
pass (1) PostOrderFunctionAttrsPass on (f1) -; CHECK-CGSCC-PASS: BISECT: running pass (2) PostOrderFunctionAttrsPass on (f2) -; CHECK-CGSCC-PASS: BISECT: running pass (3) PostOrderFunctionAttrsPass on (f3) -; CHECK-CGSCC-PASS: BISECT: running pass (4) PostOrderFunctionAttrsPass on (f4) +; CHECK-CGSCC-PASS: BISECT: running pass (1) function-attrs on (f1) +; CHECK-CGSCC-PASS: BISECT: running pass (2) function-attrs on (f2) +; CHECK-CGSCC-PASS: BISECT: running pass (3) function-attrs on (f3) +; CHECK-CGSCC-PASS: BISECT: running pass (4) function-attrs on (f4) ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=function-attrs -opt-bisect-limit=3 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-CGSCC-PASS -; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (1) PostOrderFunctionAttrsPass on (f1) -; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (2) PostOrderFunctionAttrsPass on (f2) -; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (3) PostOrderFunctionAttrsPass on (f3) -; CHECK-LIMIT-CGSCC-PASS: BISECT: NOT running pass (4) PostOrderFunctionAttrsPass on (f4) +; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (1) function-attrs on (f1) +; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (2) function-attrs on (f2) +; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (3) function-attrs on (f3) +; CHECK-LIMIT-CGSCC-PASS: BISECT: NOT running pass (4) function-attrs on (f4) ; RUN: opt -disable-output -disable-verify -opt-bisect-limit=-1 \ ; RUN: -passes='inferattrs,cgscc(function-attrs,function(early-cse))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-MULTI-PASS -; CHECK-MULTI-PASS: BISECT: running pass (1) InferFunctionAttrsPass on [module] -; CHECK-MULTI-PASS: BISECT: running pass (2) PostOrderFunctionAttrsPass on (f1) -; CHECK-MULTI-PASS: BISECT: running pass (3) EarlyCSEPass on f1 -; CHECK-MULTI-PASS: BISECT: running pass (4) PostOrderFunctionAttrsPass on (f2) -; CHECK-MULTI-PASS: BISECT: running pass (5) EarlyCSEPass on f2 -; CHECK-MULTI-PASS: BISECT: running pass (6) PostOrderFunctionAttrsPass on (f3) -; CHECK-MULTI-PASS: BISECT: running pass (7) EarlyCSEPass on f3 -; CHECK-MULTI-PASS: BISECT: running pass (8) PostOrderFunctionAttrsPass on (f4) -; CHECK-MULTI-PASS: BISECT: running pass (9) EarlyCSEPass on f4 +; CHECK-MULTI-PASS: BISECT: running pass (1) inferattrs on [module] +; CHECK-MULTI-PASS: BISECT: running pass (2) function-attrs on (f1) +; CHECK-MULTI-PASS: BISECT: running pass (3) early-cse on f1 +; CHECK-MULTI-PASS: BISECT: running pass (4) function-attrs on (f2) +; CHECK-MULTI-PASS: BISECT: running pass (5) early-cse on f2 +; CHECK-MULTI-PASS: BISECT: running pass (6) function-attrs on (f3) +; CHECK-MULTI-PASS: BISECT: running pass (7) early-cse on f3 +; CHECK-MULTI-PASS: BISECT: running pass (8) function-attrs on (f4) +; CHECK-MULTI-PASS: BISECT: running pass (9) early-cse on f4 ; RUN: opt -disable-output -disable-verify -opt-bisect-limit=7 \ ; RUN: -passes='inferattrs,cgscc(function-attrs,function(early-cse))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-MULTI-PASS -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (1) InferFunctionAttrsPass on [module] -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (2) PostOrderFunctionAttrsPass on (f1) -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (3) EarlyCSEPass on f1 -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (4) PostOrderFunctionAttrsPass on (f2) -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (5) EarlyCSEPass on f2 -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (6) PostOrderFunctionAttrsPass on (f3) -; 
-; CHECK-LIMIT-MULTI-PASS: BISECT: NOT running pass (8) PostOrderFunctionAttrsPass on (f4)
-; CHECK-LIMIT-MULTI-PASS: BISECT: NOT running pass (9) EarlyCSEPass on f4
+; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (1) inferattrs on [module]
+; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (2) function-attrs on (f1)
+; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (3) early-cse on f1
+; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (4) function-attrs on (f2)
+; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (5) early-cse on f2
+; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (6) function-attrs on (f3)
+; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (7) early-cse on f3
+; CHECK-LIMIT-MULTI-PASS: BISECT: NOT running pass (8) function-attrs on (f4)
+; CHECK-LIMIT-MULTI-PASS: BISECT: NOT running pass (9) early-cse on f4
 
 ; Make sure we don't skip writing the output to stdout.
 ; RUN: opt %s -opt-bisect-limit=0 -passes=early-cse | opt -S | FileCheck %s -check-prefix=CHECK-OUTPUT
diff --git a/llvm/test/Other/opt-disable.ll b/llvm/test/Other/opt-disable.ll
new file mode 100644
index 0000000000000..4506042215cbf
--- /dev/null
+++ b/llvm/test/Other/opt-disable.ll
@@ -0,0 +1,91 @@
+; This test uses the same IR functions as the opt-bisect test,
+; but checks the correctness of the -opt-disable flag.
+; -opt-disable-enable-verbosity is required to produce output.
+
+; RUN: opt -disable-output -disable-verify \
+; RUN: -opt-disable-enable-verbosity \
+; RUN: -passes=inferattrs -opt-disable=inferattrs %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-MODULE-PASS
+; CHECK-MODULE-PASS: OptDisable: NOT running pass inferattrs on [module]
+
+; RUN: opt -disable-output -disable-verify \
+; RUN: -opt-disable-enable-verbosity \
+; RUN: -passes=sroa -opt-disable=sroa %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-FUNCTION-PASS
+; CHECK-FUNCTION-PASS: OptDisable: NOT running pass sroa on f1
+; CHECK-FUNCTION-PASS: OptDisable: NOT running pass sroa on f2
+; CHECK-FUNCTION-PASS: OptDisable: NOT running pass sroa on f3
+; CHECK-FUNCTION-PASS: OptDisable: NOT running pass sroa on f4
+
+; RUN: opt -disable-output -disable-verify \
+; RUN: -opt-disable=inferattrs,function-attrs \
+; RUN: -opt-disable-enable-verbosity \
+; RUN: -passes='inferattrs,cgscc(function-attrs,function(early-cse))' %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=CHECK-MULTI-PASS
+; CHECK-MULTI-PASS: OptDisable: NOT running pass inferattrs on [module]
+; CHECK-MULTI-PASS: OptDisable: NOT running pass function-attrs on (f1)
+; CHECK-MULTI-PASS: OptDisable: running pass early-cse on f1
+; CHECK-MULTI-PASS: OptDisable: NOT running pass function-attrs on (f2)
+; CHECK-MULTI-PASS: OptDisable: running pass early-cse on f2
+; CHECK-MULTI-PASS: OptDisable: NOT running pass function-attrs on (f3)
+; CHECK-MULTI-PASS: OptDisable: running pass early-cse on f3
+; CHECK-MULTI-PASS: OptDisable: NOT running pass function-attrs on (f4)
+; CHECK-MULTI-PASS: OptDisable: running pass early-cse on f4
+
+declare i32 @g()
+
+define void @f1(i1 %arg) {
+entry:
+ br label %loop.0
+loop.0:
+ br i1 %arg, label %loop.0.0, label %loop.1
+loop.0.0:
+ br i1 %arg, label %loop.0.0, label %loop.0.1
+loop.0.1:
+ br i1 %arg, label %loop.0.1, label %loop.0
+loop.1:
+ br i1 %arg, label %loop.1, label %loop.1.bb1
+loop.1.bb1:
+ br i1 %arg, label %loop.1, label %loop.1.bb2
+loop.1.bb2:
+ br i1 %arg, label %end, label %loop.1.0
+loop.1.0:
+ br i1 %arg, label %loop.1.0, label %loop.1
+end:
+ ret void
+}
+
+define i32 @f2() {
+entry:
+ ret i32 0
+}
+
+define i32 @f3() {
+entry:
+ %temp = call i32 @g()
+ %icmp = icmp ugt i32 %temp, 2
+ br i1 %icmp, label %bb.true, label %bb.false
+bb.true:
+ %temp2 = call i32 @f2()
+ ret i32 %temp2
+bb.false:
+ ret i32 0
+}
+
+define void @f4(i1 %arg) {
+entry:
+ %i = alloca i32, align 4
+ call void @llvm.lifetime.start(i64 4, ptr %i)
+ br label %for.cond
+
+for.cond:
+ br i1 %arg, label %for.body, label %for.end
+
+for.body:
+ br label %for.cond
+
+for.end:
+ ret void
+}
+
+declare void @llvm.lifetime.start(i64, ptr nocapture)
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index a0061afab1db0..579e3c7dd62ab 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -116,19 +116,40 @@ def BlahLibrary : SystemRuntimeLibrary]> {
 // CHECK-EMPTY:
 // CHECK-NEXT: #include "llvm/ADT/ArrayRef.h"
 // CHECK-NEXT: #include "llvm/ADT/BitmaskEnum.h"
+// CHECK-NEXT: #include "llvm/ADT/Sequence.h"
 // CHECK-NEXT: #include "llvm/ADT/StringRef.h"
 // CHECK-NEXT: #include "llvm/Frontend/Directive/Spelling.h"
 // CHECK-NEXT: #include "llvm/Support/Compiler.h"
@@ -66,22 +67,26 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Association {
 // CHECK-NEXT: Block,
+// CHECK-NEXT: First_ = Block,
 // CHECK-NEXT: Declaration,
 // CHECK-NEXT: Delimited,
 // CHECK-NEXT: Loop,
 // CHECK-NEXT: None,
 // CHECK-NEXT: Separating,
+// CHECK-NEXT: Last_ = Separating,
 // CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: static constexpr std::size_t Association_enumSize = 6;
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Category {
 // CHECK-NEXT: Declarative,
+// CHECK-NEXT: First_ = Declarative,
 // CHECK-NEXT: Executable,
 // CHECK-NEXT: Informational,
 // CHECK-NEXT: Meta,
 // CHECK-NEXT: Subsidiary,
 // CHECK-NEXT: Utility,
+// CHECK-NEXT: Last_ = Utility,
 // CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6;
@@ -96,6 +101,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Directive {
 // CHECK-NEXT: TDLD_dira,
+// CHECK-NEXT: First_ = TDLD_dira,
+// CHECK-NEXT: Last_ = TDLD_dira,
 // CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: static constexpr std::size_t Directive_enumSize = 1;
@@ -104,8 +111,10 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Clause {
 // CHECK-NEXT: TDLC_clausea,
+// CHECK-NEXT: First_ = TDLC_clausea,
 // CHECK-NEXT: TDLC_clauseb,
 // CHECK-NEXT: TDLC_clausec,
+// CHECK-NEXT: Last_ = TDLC_clausec,
 // CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: static constexpr std::size_t Clause_enumSize = 3;
@@ -151,6 +160,22 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: LLVM_ABI StringRef getTdlAKindName(AKind x);
 // CHECK-EMPTY:
 // CHECK-NEXT: } // namespace tdl
+// CHECK-EMPTY:
+// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> {
+// CHECK-NEXT: static constexpr bool is_iterable = true;
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Category> {
+// CHECK-NEXT: static constexpr bool is_iterable = true;
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Directive> {
+// CHECK-NEXT: static constexpr bool is_iterable = true;
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> {
+// CHECK-NEXT: static constexpr bool is_iterable = true;
+// CHECK-NEXT: };
 // CHECK-NEXT: } // namespace llvm
 // CHECK-NEXT: #endif // LLVM_Tdl_INC
diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td
index 3a64bb3900a31..a25197c3efd93 100644
--- a/llvm/test/TableGen/directive2.td
+++ b/llvm/test/TableGen/directive2.td
@@ -46,6 +46,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: #define LLVM_Tdl_INC
 // CHECK-EMPTY:
 // CHECK-NEXT: #include "llvm/ADT/ArrayRef.h"
+// CHECK-NEXT: #include "llvm/ADT/Sequence.h"
 // CHECK-NEXT: #include "llvm/ADT/StringRef.h"
 // CHECK-NEXT: #include "llvm/Frontend/Directive/Spelling.h"
 // CHECK-NEXT: #include "llvm/Support/Compiler.h"
@@ -57,22 +58,26 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Association {
 // CHECK-NEXT: Block,
+// CHECK-NEXT: First_ = Block,
 // CHECK-NEXT: Declaration,
 // CHECK-NEXT: Delimited,
 // CHECK-NEXT: Loop,
 // CHECK-NEXT: None,
 // CHECK-NEXT: Separating,
+// CHECK-NEXT: Last_ = Separating,
 // CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: static constexpr std::size_t Association_enumSize = 6;
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Category {
 // CHECK-NEXT: Declarative,
+// CHECK-NEXT: First_ = Declarative,
 // CHECK-NEXT: Executable,
 // CHECK-NEXT: Informational,
 // CHECK-NEXT: Meta,
 // CHECK-NEXT: Subsidiary,
 // CHECK-NEXT: Utility,
+// CHECK-NEXT: Last_ = Utility,
 // CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6;
@@ -87,15 +92,19 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Directive {
 // CHECK-NEXT: TDLD_dira,
+// CHECK-NEXT: First_ = TDLD_dira,
+// CHECK-NEXT: Last_ = TDLD_dira,
 // CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: static constexpr std::size_t Directive_enumSize = 1;
 // CHECK-EMPTY:
 // CHECK-NEXT: enum class Clause {
 // CHECK-NEXT: TDLC_clausea,
+// CHECK-NEXT: First_ = TDLC_clausea,
 // CHECK-NEXT: TDLC_clauseb,
 // CHECK-NEXT: TDLC_clausec,
 // CHECK-NEXT: TDLC_claused,
+// CHECK-NEXT: Last_ = TDLC_claused,
 // CHECK-NEXT: };
 // CHECK-EMPTY:
 // CHECK-NEXT: static constexpr std::size_t Clause_enumSize = 4;
@@ -124,6 +133,22 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D);
 // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);
 // CHECK-NEXT: } // namespace tdl
+// CHECK-EMPTY:
+// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> {
+// CHECK-NEXT: static constexpr bool is_iterable = true;
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Category> {
+// CHECK-NEXT: static constexpr bool is_iterable = true;
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Directive> {
+// CHECK-NEXT: static constexpr bool is_iterable = true;
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> {
+// CHECK-NEXT: static constexpr bool is_iterable = true;
+// CHECK-NEXT: };
 // CHECK-NEXT: } // namespace llvm
 // CHECK-NEXT: #endif // LLVM_Tdl_INC
diff --git a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll
index 2306b72a0055f..40101595092b0 100644
--- a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll
+++ b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll
@@ -87,9 +87,9 @@ loop_exit:
 }
 
 ; CHECK: define {{.*}} void @my_async_function.resume.0(
-; CHECK-NOT: call void @llvm.lifetime.start.p0(i64 4, ptr %3)
-; CHECK: br i1 %exitCond, label %loop_exit, label %loop
-; CHECK: lifetime.end +; CHECK-NOT: llvm.lifetime +; CHECK: br i1 %exitCond, label %common.ret, label %loop +; CHECK-NOT: llvm.lifetime ; CHECK: } declare { ptr, ptr, ptr, ptr } @llvm.coro.suspend.async.sl_p0i8p0i8p0i8p0i8s(i32, ptr, ptr, ...) diff --git a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll index e390d4bdca632..303afc207c023 100644 --- a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll +++ b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll @@ -12,6 +12,6 @@ define ptr @undeclared_customalloc(i64 %size, i64 %align) { ret ptr %call } -declare ptr @customalloc2(i64, i64) allockind("alloc") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed" +declare ptr @customalloc2(i64, i64) allockind("alloc,uninitialized") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed" ; CHECK-DAG: declare ptr @customalloc2_zeroed(i64, i64) #[[CA2ATTR:[0-9]+]] ; CHECK-DAG: attributes #[[CA2ATTR]] = { allockind("alloc,zeroed") "alloc-family"="customalloc2" } diff --git a/llvm/test/Transforms/InferFunctionAttrs/no-proto.ll b/llvm/test/Transforms/InferFunctionAttrs/no-proto.ll index 12190735306dc..898d83f3fb8ae 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/no-proto.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/no-proto.ll @@ -977,3 +977,6 @@ declare void @vsscanf(...) ; CHECK: declare void @write(...) declare void @write(...) + +; CHECK: declare double @cabs(...) +declare double @cabs(...) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll index 538cc19f9722e..beb84362b7f92 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll @@ -76,4 +76,37 @@ sink: ret <2 x i64> %val.sink } +; Crashed in IC PtrReplacer because an invalid select was generated with addrspace(4) and addrspace(5) +; operands. +define amdgpu_kernel void @select_addr4_addr5(ptr addrspace(4) byref([12 x i8]) align 16 %arg) { +; CHECK-LABEL: define amdgpu_kernel void @select_addr4_addr5( +; CHECK-SAME: ptr addrspace(4) byref([12 x i8]) align 16 [[ARG:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %alloca = alloca i32, i32 0, align 8, addrspace(5) + %alloca1 = alloca [12 x i8], align 16, addrspace(5) + call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca1, ptr addrspace(4) %arg, i64 0, i1 false) + %select = select i1 false, ptr addrspace(5) %alloca1, ptr addrspace(5) %alloca + call void @llvm.memcpy.p0.p5.i64(ptr null, ptr addrspace(5) %select, i64 0, i1 false) + ret void +} + +; Same as above but with swapped operands on the select. 
+define amdgpu_kernel void @select_addr4_addr5_swapped(ptr addrspace(4) byref([12 x i8]) align 16 %arg) { +; CHECK-LABEL: define amdgpu_kernel void @select_addr4_addr5_swapped( +; CHECK-SAME: ptr addrspace(4) byref([12 x i8]) align 16 [[ARG:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %alloca = alloca i32, i32 0, align 8, addrspace(5) + %alloca1 = alloca [12 x i8], align 16, addrspace(5) + call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca1, ptr addrspace(4) %arg, i64 0, i1 false) + %select = select i1 false, ptr addrspace(5) %alloca, ptr addrspace(5) %alloca1 + call void @llvm.memcpy.p0.p5.i64(ptr null, ptr addrspace(5) %select, i64 0, i1 false) + ret void +} + declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0 diff --git a/llvm/test/Transforms/InstCombine/fold-icmp-without-constant-into-phi.ll b/llvm/test/Transforms/InstCombine/fold-icmp-without-constant-into-phi.ll new file mode 100644 index 0000000000000..73fb93f3c9d3b --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fold-icmp-without-constant-into-phi.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i1 @test(i1 %cond, i64 %left, i64 %right) { +; CHECK-LABEL: define i1 @test( +; CHECK-SAME: i1 [[COND:%.*]], i64 [[LEFT:%.*]], i64 [[RIGHT:%.*]]) { +; CHECK-NEXT: [[START:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[RIGHT]], [[LEFT]] +; CHECK-NEXT: br i1 [[CMP]], label %[[END:.*]], label %[[COND_FALSE]] +; CHECK: [[COND_FALSE]]: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret i1 false +; +start: + br i1 %cond, label %cond.true, label %cond.false + +cond.true: + %cmp = icmp sgt i64 %right, %left + br i1 %cmp, label %end, label %cond.false + +cond.false: + %left_or_right = phi i64 [ %left, %start ], [ %right, %cond.true ] + %false = icmp sgt i64 %left_or_right, %left + br label %end + +end: + %result = phi i1 [ false, %cond.true ], [ %false, %cond.false ] + ret i1 %result +} + +define i1 @test_commuted(i1 %cond, i64 %left, i64 %right) { +; CHECK-LABEL: define i1 @test_commuted( +; CHECK-SAME: i1 [[COND:%.*]], i64 [[LEFT:%.*]], i64 [[RIGHT:%.*]]) { +; CHECK-NEXT: [[START:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[RIGHT]], [[LEFT]] +; CHECK-NEXT: br i1 [[CMP]], label %[[END:.*]], label %[[COND_FALSE]] +; CHECK: [[COND_FALSE]]: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret i1 false +; +start: + br i1 %cond, label %cond.true, label %cond.false + +cond.true: + %cmp = icmp sgt i64 %right, %left + br i1 %cmp, label %end, label %cond.false + +cond.false: + %left_or_right = phi i64 [ %left, %start ], [ %right, %cond.true ] + %false = icmp slt i64 %left, %left_or_right + br label %end + +end: + %result = phi i1 [ false, %cond.true ], [ %false, %cond.false ] + ret i1 %result +} + +define i1 @test_with_wrong_icmp(i1 %cond, i64 %left, i64 %right) { +; CHECK-LABEL: define i1 @test_with_wrong_icmp( +; CHECK-SAME: i1 [[COND:%.*]], i64 [[LEFT:%.*]], i64 [[RIGHT:%.*]]) { +; CHECK-NEXT: [[START:.*]]: +; CHECK-NEXT: br i1 [[COND]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: 
[[CMP:%.*]] = icmp sgt i64 [[RIGHT]], [[LEFT]] +; CHECK-NEXT: br i1 [[CMP]], label %[[END:.*]], label %[[COND_FALSE]] +; CHECK: [[COND_FALSE]]: +; CHECK-NEXT: [[LEFT_OR_RIGHT:%.*]] = phi i64 [ [[LEFT]], %[[START]] ], [ [[RIGHT]], %[[COND_TRUE]] ] +; CHECK-NEXT: [[FALSE:%.*]] = icmp sge i64 [[LEFT_OR_RIGHT]], [[LEFT]] +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ false, %[[COND_TRUE]] ], [ [[FALSE]], %[[COND_FALSE]] ] +; CHECK-NEXT: ret i1 [[RESULT]] +; +start: + br i1 %cond, label %cond.true, label %cond.false + +cond.true: + %cmp = icmp sgt i64 %right, %left + br i1 %cmp, label %end, label %cond.false + +cond.false: + %left_or_right = phi i64 [ %left, %start ], [ %right, %cond.true ] + %false = icmp sge i64 %left_or_right, %left + br label %end + +end: + %result = phi i1 [ false, %cond.true ], [ %false, %cond.false ] + ret i1 %result +} + +define i1 @test_with_unfoldable_phi(i1 %cond, i64 %left, i64 %right) { +; CHECK-LABEL: define i1 @test_with_unfoldable_phi( +; CHECK-SAME: i1 [[COND:%.*]], i64 [[LEFT:%.*]], i64 [[RIGHT:%.*]]) { +; CHECK-NEXT: [[START:.*]]: +; CHECK-NEXT: br i1 [[COND]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[RIGHT]], [[LEFT]] +; CHECK-NEXT: br i1 [[CMP]], label %[[END:.*]], label %[[COND_FALSE]] +; CHECK: [[COND_FALSE]]: +; CHECK-NEXT: [[LEFT_OR_RIGHT:%.*]] = phi i64 [ [[LEFT]], %[[START]] ], [ [[RIGHT]], %[[COND_TRUE]] ] +; CHECK-NEXT: [[FALSE:%.*]] = icmp sgt i64 [[LEFT_OR_RIGHT]], [[LEFT]] +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ false, %[[COND_TRUE]] ], [ [[FALSE]], %[[COND_FALSE]] ] +; CHECK-NEXT: ret i1 [[RESULT]] +; +start: + br i1 %cond, label %cond.true, label %cond.false + +cond.true: + %cmp = icmp slt i64 %right, %left ; slt instead of sgt here + br i1 %cmp, label %end, label %cond.false + +cond.false: + %left_or_right = phi i64 [ %left, %start ], [ %right, %cond.true ] + %false = icmp sgt i64 %left_or_right, %left + br label %end + +end: + %result = phi i1 [ false, %cond.true ], [ %false, %cond.false ] + ret i1 %result +} + +define i1 @test_which_optimizes_to_select(i1 %cond, i64 %left, i64 %right) { +; CHECK-LABEL: define i1 @test_which_optimizes_to_select( +; CHECK-SAME: i1 [[COND:%.*]], i64 [[LEFT:%.*]], i64 [[RIGHT:%.*]]) { +; CHECK-NEXT: [[START:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +; CHECK: [[COND_TRUE]]: +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp sgt i64 [[RIGHT]], [[LEFT]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[COND_FALSE]], label %[[END:.*]] +; CHECK: [[COND_FALSE]]: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi i1 [ false, %[[COND_TRUE]] ], [ [[COND]], %[[COND_FALSE]] ] +; CHECK-NEXT: ret i1 [[RESULT]] +; +start: + br i1 %cond, label %cond.true, label %cond.false + +cond.true: + %cmp = icmp sle i64 %right, %left + br i1 %cmp, label %end, label %cond.false + +cond.false: + %left_or_right = phi i64 [ %left, %start ], [ %right, %cond.true ] + %false = icmp sgt i64 %left_or_right, %left + br label %end + +end: + %result = phi i1 [ false, %cond.true ], [ %false, %cond.false ] + ret i1 %result +} diff --git a/llvm/test/Transforms/InstCombine/freeze.ll b/llvm/test/Transforms/InstCombine/freeze.ll index 9733f1b732c3f..3fedead2feab8 100644 --- a/llvm/test/Transforms/InstCombine/freeze.ll +++ b/llvm/test/Transforms/InstCombine/freeze.ll @@ -142,6 +142,17 @@ define i32 @early_freeze_test3(i32 
%v1) { ret i32 %v4.fr } +define i32 @early_freeze_test4(i32 %v1) { +; CHECK-LABEL: @early_freeze_test4( +; CHECK-NEXT: [[V2_FR:%.*]] = freeze i32 [[V2:%.*]] +; CHECK-NEXT: [[V3:%.*]] = mul i32 [[V2_FR]], [[V2_FR]] +; CHECK-NEXT: ret i32 [[V3]] +; + %v2 = mul i32 %v1, %v1 + %v2.fr = freeze i32 %v2 + ret i32 %v2.fr +} + ; If replace all dominated uses of v to freeze(v). define void @freeze_dominated_uses_test1(i32 %v) { diff --git a/llvm/test/Transforms/InstCombine/icmp-select.ll b/llvm/test/Transforms/InstCombine/icmp-select.ll index a038731abbc48..c6c0ba385a6fd 100644 --- a/llvm/test/Transforms/InstCombine/icmp-select.ll +++ b/llvm/test/Transforms/InstCombine/icmp-select.ll @@ -248,10 +248,9 @@ define i1 @icmp_select_implied_cond_relational_off_by_one(i8 %x, i8 %y) { define i1 @umin_seq_comparison(i8 %x, i8 %y) { ; CHECK-LABEL: @umin_seq_comparison( -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CMP21:%.*]] = icmp ule i8 [[X]], [[Y:%.*]] -; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP21]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: [[Y:%.*]] = freeze i8 [[Y1:%.*]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp ule i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i1 [[CMP21]] ; %min = call i8 @llvm.umin.i8(i8 %x, i8 %y) %cmp1 = icmp eq i8 %x, 0 diff --git a/llvm/test/Transforms/InstCombine/ptrauth-call.ll b/llvm/test/Transforms/InstCombine/ptrauth-call.ll new file mode 100644 index 0000000000000..eefe593361c05 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ptrauth-call.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +declare i64 @f(i32) +declare ptr @f2(i32) + +define i32 @test_ptrauth_call(i32 %a0) { +; CHECK-LABEL: @test_ptrauth_call( +; CHECK-NEXT: [[V0:%.*]] = call i32 @f(i32 [[A0:%.*]]) +; CHECK-NEXT: ret i32 [[V0]] +; + %v0 = call i32 ptrauth(ptr @f, i32 0)(i32 %a0) [ "ptrauth"(i32 0, i64 0) ] + ret i32 %v0 +} + +define i32 @test_ptrauth_call_disc(i32 %a0) { +; CHECK-LABEL: @test_ptrauth_call_disc( +; CHECK-NEXT: [[V0:%.*]] = call i32 @f(i32 [[A0:%.*]]) +; CHECK-NEXT: ret i32 [[V0]] +; + %v0 = call i32 ptrauth(ptr @f, i32 1, i64 5678)(i32 %a0) [ "ptrauth"(i32 1, i64 5678) ] + ret i32 %v0 +} + +@f_addr_disc.ref = constant ptr ptrauth(ptr @f, i32 1, i64 0, ptr @f_addr_disc.ref) + +define i32 @test_ptrauth_call_addr_disc(i32 %a0) { +; CHECK-LABEL: @test_ptrauth_call_addr_disc( +; CHECK-NEXT: [[V0:%.*]] = call i32 @f(i32 [[A0:%.*]]) +; CHECK-NEXT: ret i32 [[V0]] +; + %v0 = call i32 ptrauth(ptr @f, i32 1, i64 0, ptr @f_addr_disc.ref)(i32 %a0) [ "ptrauth"(i32 1, i64 ptrtoint (ptr @f_addr_disc.ref to i64)) ] + ret i32 %v0 +} + +@f_both_disc.ref = constant ptr ptrauth(ptr @f, i32 1, i64 1234, ptr @f_both_disc.ref) + +define i32 @test_ptrauth_call_blend(i32 %a0) { +; CHECK-LABEL: @test_ptrauth_call_blend( +; CHECK-NEXT: [[V0:%.*]] = call i32 @f(i32 [[A0:%.*]]) +; CHECK-NEXT: ret i32 [[V0]] +; + %v = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @f_both_disc.ref to i64), i64 1234) + %v0 = call i32 ptrauth(ptr @f, i32 1, i64 1234, ptr @f_both_disc.ref)(i32 %a0) [ "ptrauth"(i32 1, i64 %v) ] + ret i32 %v0 +} + +define i64 @test_ptrauth_call_cast(i32 %a0) { +; CHECK-LABEL: @test_ptrauth_call_cast( +; CHECK-NEXT: [[V0:%.*]] = call i64 @f2(i32 [[A0:%.*]]) +; CHECK-NEXT: ret i64 [[V0]] +; + %v0 = call i64 ptrauth(ptr @f2, i32 0)(i32 %a0) [ "ptrauth"(i32 0, i64 0) ] + ret i64 %v0 +} + +define i32 @test_ptrauth_call_mismatch_key(i32 %a0) { +; CHECK-LABEL: 
@test_ptrauth_call_mismatch_key( +; CHECK-NEXT: [[V0:%.*]] = call i32 ptrauth (ptr @f, i32 1, i64 5678)(i32 [[A0:%.*]]) [ "ptrauth"(i32 0, i64 5678) ] +; CHECK-NEXT: ret i32 [[V0]] +; + %v0 = call i32 ptrauth(ptr @f, i32 1, i64 5678)(i32 %a0) [ "ptrauth"(i32 0, i64 5678) ] + ret i32 %v0 +} + +define i32 @test_ptrauth_call_mismatch_disc(i32 %a0) { +; CHECK-LABEL: @test_ptrauth_call_mismatch_disc( +; CHECK-NEXT: [[V0:%.*]] = call i32 ptrauth (ptr @f, i32 1, i64 5678)(i32 [[A0:%.*]]) [ "ptrauth"(i32 1, i64 0) ] +; CHECK-NEXT: ret i32 [[V0]] +; + %v0 = call i32 ptrauth(ptr @f, i32 1, i64 5678)(i32 %a0) [ "ptrauth"(i32 1, i64 0) ] + ret i32 %v0 +} + +define i32 @test_ptrauth_call_mismatch_blend(i32 %a0) { +; CHECK-LABEL: @test_ptrauth_call_mismatch_blend( +; CHECK-NEXT: [[V:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @f_both_disc.ref to i64), i64 0) +; CHECK-NEXT: [[V0:%.*]] = call i32 ptrauth (ptr @f, i32 1, i64 1234, ptr @f_both_disc.ref)(i32 [[A0:%.*]]) [ "ptrauth"(i32 1, i64 [[V]]) ] +; CHECK-NEXT: ret i32 [[V0]] +; + %v = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @f_both_disc.ref to i64), i64 0) + %v0 = call i32 ptrauth(ptr @f, i32 1, i64 1234, ptr @f_both_disc.ref)(i32 %a0) [ "ptrauth"(i32 1, i64 %v) ] + ret i32 %v0 +} + +define i32 @test_ptrauth_call_mismatch_blend_addr(i32 %a0) { +; CHECK-LABEL: @test_ptrauth_call_mismatch_blend_addr( +; CHECK-NEXT: [[V:%.*]] = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @f_addr_disc.ref to i64), i64 1234) +; CHECK-NEXT: [[V0:%.*]] = call i32 ptrauth (ptr @f, i32 1, i64 1234, ptr @f_both_disc.ref)(i32 [[A0:%.*]]) [ "ptrauth"(i32 1, i64 [[V]]) ] +; CHECK-NEXT: ret i32 [[V0]] +; + %v = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr @f_addr_disc.ref to i64), i64 1234) + %v0 = call i32 ptrauth(ptr @f, i32 1, i64 1234, ptr @f_both_disc.ref)(i32 %a0) [ "ptrauth"(i32 1, i64 %v) ] + ret i32 %v0 +} + +declare i64 @llvm.ptrauth.blend(i64, i64) diff --git a/llvm/test/Transforms/InstCombine/ptrauth-intrinsics-call.ll b/llvm/test/Transforms/InstCombine/ptrauth-intrinsics-call.ll new file mode 100644 index 0000000000000..5e597c9155f40 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ptrauth-intrinsics-call.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i32 @test_ptrauth_call_sign(ptr %p) { +; CHECK-LABEL: @test_ptrauth_call_sign( +; CHECK-NEXT: [[V3:%.*]] = call i32 [[P:%.*]]() +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = ptrtoint ptr %p to i64 + %v1 = call i64 @llvm.ptrauth.sign(i64 %v0, i32 2, i64 5678) + %v2 = inttoptr i64 %v1 to ptr + %v3 = call i32 %v2() [ "ptrauth"(i32 2, i64 5678) ] + ret i32 %v3 +} + +define i32 @test_ptrauth_call_sign_otherbundle(ptr %p) { +; CHECK-LABEL: @test_ptrauth_call_sign_otherbundle( +; CHECK-NEXT: [[V3:%.*]] = call i32 [[P:%.*]]() [ "somebundle"(ptr null), "otherbundle"(i64 0) ] +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = ptrtoint ptr %p to i64 + %v1 = call i64 @llvm.ptrauth.sign(i64 %v0, i32 2, i64 5678) + %v2 = inttoptr i64 %v1 to ptr + %v3 = call i32 %v2() [ "somebundle"(ptr null), "ptrauth"(i32 2, i64 5678), "otherbundle"(i64 0) ] + ret i32 %v3 +} + +define i32 @test_ptrauth_call_resign(ptr %p) { +; CHECK-LABEL: @test_ptrauth_call_resign( +; CHECK-NEXT: [[V3:%.*]] = call i32 [[P:%.*]]() [ "ptrauth"(i32 1, i64 1234) ] +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = ptrtoint ptr %p to i64 + %v1 = call i64 @llvm.ptrauth.resign(i64 %v0, i32 1, i64 1234, i32 1, i64 5678) + %v2 = inttoptr i64 %v1 to ptr + %v3 = 
call i32 %v2() [ "ptrauth"(i32 1, i64 5678) ] + ret i32 %v3 +} + +define i32 @test_ptrauth_call_resign_blend(ptr %pp) { +; CHECK-LABEL: @test_ptrauth_call_resign_blend( +; CHECK-NEXT: [[V01:%.*]] = load ptr, ptr [[PP:%.*]], align 8 +; CHECK-NEXT: [[V6:%.*]] = call i32 [[V01]]() [ "ptrauth"(i32 1, i64 1234) ] +; CHECK-NEXT: ret i32 [[V6]] +; + %v0 = load ptr, ptr %pp, align 8 + %v1 = ptrtoint ptr %pp to i64 + %v2 = ptrtoint ptr %v0 to i64 + %v3 = call i64 @llvm.ptrauth.blend(i64 %v1, i64 5678) + %v4 = call i64 @llvm.ptrauth.resign(i64 %v2, i32 1, i64 1234, i32 1, i64 %v3) + %v5 = inttoptr i64 %v4 to ptr + %v6 = call i32 %v5() [ "ptrauth"(i32 1, i64 %v3) ] + ret i32 %v6 +} + +define i32 @test_ptrauth_call_resign_blend_2(ptr %pp) { +; CHECK-LABEL: @test_ptrauth_call_resign_blend_2( +; CHECK-NEXT: [[V01:%.*]] = load ptr, ptr [[PP:%.*]], align 8 +; CHECK-NEXT: [[V1:%.*]] = ptrtoint ptr [[PP]] to i64 +; CHECK-NEXT: [[V3:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[V1]], i64 5678) +; CHECK-NEXT: [[V6:%.*]] = call i32 [[V01]]() [ "ptrauth"(i32 0, i64 [[V3]]) ] +; CHECK-NEXT: ret i32 [[V6]] +; + %v0 = load ptr, ptr %pp, align 8 + %v1 = ptrtoint ptr %pp to i64 + %v2 = ptrtoint ptr %v0 to i64 + %v3 = call i64 @llvm.ptrauth.blend(i64 %v1, i64 5678) + %v4 = call i64 @llvm.ptrauth.resign(i64 %v2, i32 0, i64 %v3, i32 0, i64 1234) + %v5 = inttoptr i64 %v4 to ptr + %v6 = call i32 %v5() [ "ptrauth"(i32 0, i64 1234) ] + ret i32 %v6 +} + +define i32 @test_ptrauth_call_resign_mismatch_key(ptr %p) { +; CHECK-LABEL: @test_ptrauth_call_resign_mismatch_key( +; CHECK-NEXT: [[V0:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[V1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[V0]], i32 1, i64 1234, i32 0, i64 5678) +; CHECK-NEXT: [[V2:%.*]] = inttoptr i64 [[V1]] to ptr +; CHECK-NEXT: [[V3:%.*]] = call i32 [[V2]]() [ "ptrauth"(i32 1, i64 5678) ] +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = ptrtoint ptr %p to i64 + %v1 = call i64 @llvm.ptrauth.resign(i64 %v0, i32 1, i64 1234, i32 0, i64 5678) + %v2 = inttoptr i64 %v1 to ptr + %v3 = call i32 %v2() [ "ptrauth"(i32 1, i64 5678) ] + ret i32 %v3 +} + +define i32 @test_ptrauth_call_resign_mismatch_disc(ptr %p) { +; CHECK-LABEL: @test_ptrauth_call_resign_mismatch_disc( +; CHECK-NEXT: [[V0:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[V1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[V0]], i32 1, i64 1234, i32 0, i64 9900) +; CHECK-NEXT: [[V2:%.*]] = inttoptr i64 [[V1]] to ptr +; CHECK-NEXT: [[V3:%.*]] = call i32 [[V2]]() [ "ptrauth"(i32 1, i64 5678) ] +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = ptrtoint ptr %p to i64 + %v1 = call i64 @llvm.ptrauth.resign(i64 %v0, i32 1, i64 1234, i32 0, i64 9900) + %v2 = inttoptr i64 %v1 to ptr + %v3 = call i32 %v2() [ "ptrauth"(i32 1, i64 5678) ] + ret i32 %v3 +} + +define i32 @test_ptrauth_call_resign_mismatch_blend(ptr %pp) { +; CHECK-LABEL: @test_ptrauth_call_resign_mismatch_blend( +; CHECK-NEXT: [[V0:%.*]] = load ptr, ptr [[PP:%.*]], align 8 +; CHECK-NEXT: [[V1:%.*]] = ptrtoint ptr [[PP]] to i64 +; CHECK-NEXT: [[V2:%.*]] = ptrtoint ptr [[V0]] to i64 +; CHECK-NEXT: [[V6:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[V1]], i64 5678) +; CHECK-NEXT: [[V4:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[V2]], i32 1, i64 1234, i32 1, i64 [[V6]]) +; CHECK-NEXT: [[V5:%.*]] = inttoptr i64 [[V4]] to ptr +; CHECK-NEXT: [[V3:%.*]] = call i32 [[V5]]() [ "ptrauth"(i32 1, i64 [[V1]]) ] +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = load ptr, ptr %pp, align 8 + %v1 = ptrtoint ptr %pp to i64 + %v2 = ptrtoint ptr %v0 to i64 + %v3 = call i64 @llvm.ptrauth.blend(i64 %v1, 
i64 5678) + %v4 = call i64 @llvm.ptrauth.resign(i64 %v2, i32 1, i64 1234, i32 1, i64 %v3) + %v5 = inttoptr i64 %v4 to ptr + %v6 = call i32 %v5() [ "ptrauth"(i32 1, i64 %v1) ] + ret i32 %v6 +} + +define i32 @test_ptrauth_call_resign_changing_call_key(ptr %p) { +; CHECK-LABEL: @test_ptrauth_call_resign_changing_call_key( +; CHECK-NEXT: [[V0:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[V1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[V0]], i32 2, i64 1234, i32 1, i64 5678) +; CHECK-NEXT: [[V2:%.*]] = inttoptr i64 [[V1]] to ptr +; CHECK-NEXT: [[V3:%.*]] = call i32 [[V2]]() [ "ptrauth"(i32 1, i64 5678) ] +; CHECK-NEXT: ret i32 [[V3]] +; + %v0 = ptrtoint ptr %p to i64 + %v1 = call i64 @llvm.ptrauth.resign(i64 %v0, i32 2, i64 1234, i32 1, i64 5678) + %v2 = inttoptr i64 %v1 to ptr + %v3 = call i32 %v2() [ "ptrauth"(i32 1, i64 5678) ] + ret i32 %v3 +} + +declare i64 @llvm.ptrauth.sign(i64, i32, i64) +declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64) +declare i64 @llvm.ptrauth.blend(i64, i64) diff --git a/llvm/test/Transforms/InstCombine/select-fixed-zero.ll b/llvm/test/Transforms/InstCombine/select-fixed-zero.ll new file mode 100644 index 0000000000000..7f326d158776b --- /dev/null +++ b/llvm/test/Transforms/InstCombine/select-fixed-zero.ll @@ -0,0 +1,221 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +; (select (icmp x, 0, eq), 0, (umin x, y)) -> (umin x, y) +define i64 @umin_select(i64 %a, i64 %b) { +; CHECK-LABEL: @umin_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[A:%.*]], i64 [[B_FR]]) +; CHECK-NEXT: ret i64 [[UMIN]] +; + %cond = icmp eq i64 %a, 0 + %umin = call i64 @llvm.umin.i64(i64 %a, i64 %b) + %select = select i1 %cond, i64 0, i64 %umin + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (mul x, y)) -> (mul x, y) +define i64 @mul_select(i64 %a, i64 %b) { +; CHECK-LABEL: @mul_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[A:%.*]], [[B_FR]] +; CHECK-NEXT: ret i64 [[MUL]] +; + %cond = icmp eq i64 %a, 0 + %mul = mul i64 %a, %b + %select = select i1 %cond, i64 0, i64 %mul + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (mul x, y)) -> (mul x, y) +define i64 @mul_select_comm(i64 %a, i64 %b) { +; CHECK-LABEL: @mul_select_comm( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[B_FR]], [[A:%.*]] +; CHECK-NEXT: ret i64 [[MUL]] +; + %cond = icmp eq i64 %a, 0 + %mul = mul i64 %b, %a + %select = select i1 %cond, i64 0, i64 %mul + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (shl x, y)) -> (shl x, y) +define i64 @shl_select(i64 %a, i64 %b) { +; CHECK-LABEL: @shl_select( +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[A:%.*]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[A]], [[B_FR:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], i64 0, i64 [[SHL]] +; CHECK-NEXT: ret i64 [[SELECT]] +; + %cond = icmp eq i64 %a, 0 + %shl = shl i64 %a, %b + %select = select i1 %cond, i64 0, i64 %shl + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (and x, y)) -> (and x, y) +define i64 @and_select(i64 %a, i64 %b) { +; CHECK-LABEL: @and_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[A:%.*]], [[B_FR]] +; CHECK-NEXT: ret i64 [[AND]] +; + %cond = icmp eq i64 %a, 0 + %and = and i64 %a, %b + %select = select i1 %cond, i64 0, i64 %and + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (and x, 
y)) -> (and x, y) +define i64 @and_select_comm(i64 %a, i64 %b) { +; CHECK-LABEL: @and_select_comm( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[B_FR]], [[A:%.*]] +; CHECK-NEXT: ret i64 [[AND]] +; + %cond = icmp eq i64 %a, 0 + %and = and i64 %b, %a + %select = select i1 %cond, i64 0, i64 %and + ret i64 %select +} + +; (select (icmp x, 0, ne), (ashr x, y), 0) -> (ashr x, y) +define i64 @ashr_select(i64 %a, i64 %b) { +; CHECK-LABEL: @ashr_select( +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp eq i64 [[A:%.*]], 0 +; CHECK-NEXT: [[ASHR:%.*]] = ashr i64 [[A]], [[B_FR:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND_NOT]], i64 0, i64 [[ASHR]] +; CHECK-NEXT: ret i64 [[SELECT]] +; + %cond = icmp ne i64 0, %a + %ashr = ashr i64 %a, %b + %select = select i1 %cond, i64 %ashr, i64 0 + ret i64 %select +} + +; (select (icmp x, 0, ne), (lshr x, y), 0) -> (lshr x, y) +define i64 @lshr_select(i64 %a, i64 %b) { +; CHECK-LABEL: @lshr_select( +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp eq i64 [[A:%.*]], 0 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i64 [[A]], [[B_FR:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND_NOT]], i64 0, i64 [[LSHR]] +; CHECK-NEXT: ret i64 [[SELECT]] +; + %cond = icmp ne i64 0, %a + %lshr = lshr i64 %a, %b + %select = select i1 %cond, i64 %lshr, i64 0 + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, fshr(x, x, y)) -> fshr(x, x, y) +define i64 @fshr_select(i64 %a, i64 %b) { +; CHECK-LABEL: @fshr_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[FSHR:%.*]] = call i64 @llvm.fshr.i64(i64 [[A:%.*]], i64 [[A]], i64 [[B_FR]]) +; CHECK-NEXT: ret i64 [[FSHR]] +; + %cond = icmp eq i64 %a, 0 + %fshr = call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) + %select = select i1 %cond, i64 0, i64 %fshr + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (fshl x, x, y)) -> (fshl x, x, y) +define i64 @fshl_select(i64 %a, i64 %b) { +; CHECK-LABEL: @fshl_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[FSHL:%.*]] = call i64 @llvm.fshl.i64(i64 [[A:%.*]], i64 [[A]], i64 [[B_FR]]) +; CHECK-NEXT: ret i64 [[FSHL]] +; + %cond = icmp eq i64 %a, 0 + %fshl = call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b) + %select = select i1 %cond, i64 0, i64 %fshl + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (fshr x, z, y)) -> leave as is +define i64 @fshr_select_no_combine(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: @fshr_select_no_combine( +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[A:%.*]], 0 +; CHECK-NEXT: [[FSHR:%.*]] = call i64 @llvm.fshr.i64(i64 [[A]], i64 [[B:%.*]], i64 [[C:%.*]]) +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], i64 0, i64 [[FSHR]] +; CHECK-NEXT: ret i64 [[SELECT]] +; + %cond = icmp eq i64 %a, 0 + %fshr = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) + %select = select i1 %cond, i64 0, i64 %fshr + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (sdiv x, y)) -> (sdiv x, y) +define i64 @sdiv_select(i64 %a, i64 %b) { +; CHECK-LABEL: @sdiv_select( +; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[A:%.*]], [[B_FR:%.*]] +; CHECK-NEXT: ret i64 [[DIV]] +; + %cond = icmp eq i64 %a, 0 + %div = sdiv i64 %a, %b + %select = select i1 %cond, i64 0, i64 %div + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (udiv x, y)) -> (udiv x, y) +define i64 @udiv_select(i64 %a, i64 %b) { +; CHECK-LABEL: @udiv_select( +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[A:%.*]], [[B_FR:%.*]] +; CHECK-NEXT: ret i64 [[DIV]] +; + %cond = icmp eq i64 %a, 0 + %div = udiv i64 %a, %b + %select = select i1 %cond, i64 0, i64 %div + ret i64 %select +} + 
+; (select (icmp x, 0, eq), 0, (srem x, y)) -> (srem x, y) +define i64 @srem_select(i64 %a, i64 %b) { +; CHECK-LABEL: @srem_select( +; CHECK-NEXT: [[REM:%.*]] = srem i64 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret i64 [[REM]] +; + %cond = icmp eq i64 %a, 0 + %rem = srem i64 %a, %b + %select = select i1 %cond, i64 0, i64 %rem + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (urem x, y)) -> (urem x, y) +define i64 @urem_select(i64 %a, i64 %b) { +; CHECK-LABEL: @urem_select( +; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret i64 [[REM]] +; + %cond = icmp eq i64 %a, 0 + %rem = urem i64 %a, %b + %select = select i1 %cond, i64 0, i64 %rem + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (icmp x, 0, slt)) -> (icmp x, 0, slt) +define i1 @icmp_slt_select(i64 %a) { +; CHECK-LABEL: @icmp_slt_select( +; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i64 [[A:%.*]], 0 +; CHECK-NEXT: ret i1 [[ICMP]] +; + %cond = icmp eq i64 %a, 0 + %icmp = icmp slt i64 %a, 0 + %select = select i1 %cond, i1 0, i1 %icmp + ret i1 %select +} + +; (select (icmp x, 0, eq), 0, (sub 0, x)) -> (sub 0, x) +define i64 @sub_select(i64 %a) { +; CHECK-LABEL: @sub_select( +; CHECK-NEXT: [[SUB:%.*]] = sub i64 0, [[A:%.*]] +; CHECK-NEXT: ret i64 [[SUB]] +; + %cond = icmp eq i64 %a, 0 + %sub = sub i64 0, %a + %select = select i1 %cond, i64 0, i64 %sub + ret i64 %select +} diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index ef5874ffd46ad..1f9ee83536016 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -893,10 +893,9 @@ define i32 @test56(i16 %x) { define i32 @test57(i32 %x, i32 %y) { ; CHECK-LABEL: @test57( -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X]], 0 -; CHECK-NEXT: [[DOTAND:%.*]] = select i1 [[TOBOOL]], i32 0, i32 [[AND]] -; CHECK-NEXT: ret i32 [[DOTAND]] +; CHECK-NEXT: [[Y:%.*]] = freeze i32 [[Y1:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i32 [[AND]] ; %and = and i32 %x, %y %tobool = icmp eq i32 %x, 0 diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll index d824d6d35643d..3cb6290b1a808 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll @@ -35,8 +35,7 @@ define double @test_atan_neg0() { define double @test_atan_poison() { ; CHECK-LABEL: define double @test_atan_poison() { -; CHECK-NEXT: [[RES:%.*]] = call double @llvm.atan.f64(double poison) -; CHECK-NEXT: ret double [[RES]] +; CHECK-NEXT: ret double poison ; %res = call double @llvm.atan.f64(double poison) ret double %res diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll b/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll index a4f318bbc834c..96419382c7b7f 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll @@ -35,8 +35,7 @@ define double @test_sinh_neg0() { define double @test_sinh_poison() { ; CHECK-LABEL: define double @test_sinh_poison() { -; CHECK-NEXT: [[RES:%.*]] = call double @llvm.sinh.f64(double poison) -; CHECK-NEXT: ret double [[RES]] +; CHECK-NEXT: ret double poison ; %res = call double @llvm.sinh.f64(double poison) ret double %res @@ -121,8 +120,7 @@ define double @test_cosh_neg0() { define double 
@test_cosh_poison() { ; CHECK-LABEL: define double @test_cosh_poison() { -; CHECK-NEXT: [[RES:%.*]] = call double @llvm.cosh.f64(double poison) -; CHECK-NEXT: ret double [[RES]] +; CHECK-NEXT: ret double poison ; %res = call double @llvm.cosh.f64(double poison) ret double %res diff --git a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll index 8578aa9fa84b3..e4cfa4673a979 100644 --- a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll +++ b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll @@ -87,3 +87,202 @@ define void @pow_poison(i16 %arg_int,float %arg_flt, ptr %P) { ret void } + +define void @sin_poison(ptr %P) { +; CHECK-LABEL: @sin_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %sin_f32 = call float @llvm.sin(float poison) + store volatile float %sin_f32, ptr %P + + %sin_2xf32 = call <2 x float> @llvm.sin(<2 x float> poison) + store volatile <2 x float> %sin_2xf32, ptr %P + + %sin_4xf64 = call <4 x double> @llvm.sin(<4 x double> poison) + store volatile <4 x double> %sin_4xf64, ptr %P + + %asin_f32 = call float @llvm.asin(float poison) + store volatile float %asin_f32, ptr %P + + %asin_2xf32 = call <2 x float> @llvm.asin(<2 x float> poison) + store volatile <2 x float> %asin_2xf32, ptr %P + + %asin_4xf64 = call <4 x double> @llvm.asin(<4 x double> poison) + store volatile <4 x double> %asin_4xf64, ptr %P + + ret void +} + + +define void @cos_poison(ptr %P) { +; CHECK-LABEL: @cos_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %cos_f32 = call float @llvm.cos(float poison) + store volatile float %cos_f32, ptr %P + + %cos_2xf32 = call <2 x float> @llvm.cos(<2 x float> poison) + store volatile <2 x float> %cos_2xf32, ptr %P + + %cos_4xf64 = call <4 x double> @llvm.cos(<4 x double> poison) + store volatile <4 x double> %cos_4xf64, ptr %P + + %acos_f32 = call float @llvm.acos(float poison) + store volatile float %acos_f32, ptr %P + + %acos_2xf32 = call <2 x float> @llvm.acos(<2 x float> poison) + store volatile <2 x float> %acos_2xf32, ptr %P + + %acos_4xf64 = call <4 x double> @llvm.acos(<4 x double> poison) + store volatile <4 x double> %acos_4xf64, ptr %P + + ret void +} + + +define void @tan_poison(ptr %P) { +; CHECK-LABEL: @tan_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x 
float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %tan_f32 = call float @llvm.tan(float poison) + store volatile float %tan_f32, ptr %P + + %tan_2xf32 = call <2 x float> @llvm.tan(<2 x float> poison) + store volatile <2 x float> %tan_2xf32, ptr %P + + %tan_4xf64 = call <4 x double> @llvm.tan(<4 x double> poison) + store volatile <4 x double> %tan_4xf64, ptr %P + + %atan_f32 = call float @llvm.atan(float poison) + store volatile float %atan_f32, ptr %P + + %atan_2xf32 = call <2 x float> @llvm.atan(<2 x float> poison) + store volatile <2 x float> %atan_2xf32, ptr %P + + %atan_4xf64 = call <4 x double> @llvm.atan(<4 x double> poison) + store volatile <4 x double> %atan_4xf64, ptr %P + + %atan2_f32 = call float @llvm.atan2(float poison, float poison) + store volatile float %atan2_f32, ptr %P + + %atan2_2xf32 = call <2 x float> @llvm.atan2(<2 x float> poison, <2 x float> poison) + store volatile <2 x float> %atan2_2xf32, ptr %P + + %atan2_4xf64 = call <4 x double> @llvm.atan2(<4 x double> poison, <4 x double> poison) + store volatile <4 x double> %atan2_4xf64, ptr %P + + ret void +} + + +define void @sincos_poison(ptr %P) { +; CHECK-LABEL: @sincos_poison( +; CHECK-NEXT: store volatile { float, float } poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile { <2 x float>, <2 x float> } poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile { <4 x double>, <4 x double> } poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile { float, float } poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile { <2 x float>, <2 x float> } poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile { <4 x double>, <4 x double> } poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %sincos_f32 = call { float, float } @llvm.sincos(float poison) + store volatile { float, float } %sincos_f32, ptr %P + + %sincos_2xf32 = call { <2 x float>, <2 x float> } @llvm.sincos(<2 x float> poison) + store volatile { <2 x float>, <2 x float> } %sincos_2xf32, ptr %P + + %sincos_4xf64 = call { <4 x double>, <4 x double> } @llvm.sincos(<4 x double> poison) + store volatile { <4 x double>, <4 x double> } %sincos_4xf64, ptr %P + + %sincospi_f32 = call { float, float } @llvm.sincospi(float poison) + store volatile { float, float } %sincospi_f32, ptr %P + + %sincospi_2xf32 = call { <2 x float>, <2 x float> } @llvm.sincospi(<2 x float> poison) + store volatile { <2 x float>, <2 x float> } %sincospi_2xf32, ptr %P + + %sincospi_4xf64 = call { <4 x double>, <4 x double> } @llvm.sincospi(<4 x double> poison) + store volatile { <4 x double>, <4 x double> } %sincospi_4xf64, ptr %P + + ret void +} + + +define void @sinh_poison(ptr %P) { +; CHECK-LABEL: @sinh_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %sinh_f32 = call float @llvm.sinh(float poison) + store volatile float %sinh_f32, ptr %P + + %sinh_2xf32 = call <2 x float> @llvm.sinh(<2 x float> poison) + store volatile <2 x float> %sinh_2xf32, ptr %P + + %sinh_4xf64 = call <4 x double> @llvm.sinh(<4 x double> poison) + store volatile <4 x double> %sinh_4xf64, ptr %P + + ret void +} + + +define void @cosh_poison(ptr %P) { +; CHECK-LABEL: @cosh_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: 
store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT:    ret void
+;
+  %cosh_f32 = call float @llvm.cosh(float poison)
+  store volatile float %cosh_f32, ptr %P
+
+  %cosh_2xf32 = call <2 x float> @llvm.cosh(<2 x float> poison)
+  store volatile <2 x float> %cosh_2xf32, ptr %P
+
+  %cosh_4xf64 = call <4 x double> @llvm.cosh(<4 x double> poison)
+  store volatile <4 x double> %cosh_4xf64, ptr %P
+
+  ret void
+}
+
+
+define void @tanh_poison(ptr %P) {
+; CHECK-LABEL: @tanh_poison(
+; CHECK-NEXT:    store volatile float poison, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    store volatile <2 x float> poison, ptr [[P]], align 8
+; CHECK-NEXT:    store volatile <4 x double> poison, ptr [[P]], align 32
+; CHECK-NEXT:    ret void
+;
+  %tanh_f32 = call float @llvm.tanh(float poison)
+  store volatile float %tanh_f32, ptr %P
+
+  %tanh_2xf32 = call <2 x float> @llvm.tanh(<2 x float> poison)
+  store volatile <2 x float> %tanh_2xf32, ptr %P
+
+  %tanh_4xf64 = call <4 x double> @llvm.tanh(<4 x double> poison)
+  store volatile <4 x double> %tanh_4xf64, ptr %P
+
+  ret void
+}
diff --git a/llvm/test/Transforms/InstSimplify/sincos.ll b/llvm/test/Transforms/InstSimplify/sincos.ll
index e0f81ee45af05..144da53c6917b 100644
--- a/llvm/test/Transforms/InstSimplify/sincos.ll
+++ b/llvm/test/Transforms/InstSimplify/sincos.ll
@@ -50,8 +50,7 @@ define { <2 x float>, <2 x float> } @sincos_zero_vector() {
 
 define { float, float } @sincos_poison() {
 ; CHECK-LABEL: define { float, float } @sincos_poison() {
-; CHECK-NEXT:    [[RET:%.*]] = call { float, float } @llvm.sincos.f32(float poison)
-; CHECK-NEXT:    ret { float, float } [[RET]]
+; CHECK-NEXT:    ret { float, float } poison
 ;
   %ret = call { float, float } @llvm.sincos.f32(float poison)
   ret { float, float } %ret
@@ -59,8 +58,7 @@ define { float, float } @sincos_poison() {
 
 define { <2 x float>, <2 x float> } @sincos_poison_vector() {
 ; CHECK-LABEL: define { <2 x float>, <2 x float> } @sincos_poison_vector() {
-; CHECK-NEXT:    [[RET:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> poison)
-; CHECK-NEXT:    ret { <2 x float>, <2 x float> } [[RET]]
+; CHECK-NEXT:    ret { <2 x float>, <2 x float> } poison
 ;
   %ret = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> poison)
   ret { <2 x float>, <2 x float> } %ret
@@ -68,8 +66,7 @@ define { <2 x float>, <2 x float> } @sincos_poison_vector() {
 
 define { <vscale x 2 x float>, <vscale x 2 x float> } @sincos_poison_scalable_vector() {
 ; CHECK-LABEL: define { <vscale x 2 x float>, <vscale x 2 x float> } @sincos_poison_scalable_vector() {
-; CHECK-NEXT:    [[RET:%.*]] = call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.sincos.nxv2f32(<vscale x 2 x float> poison)
-; CHECK-NEXT:    ret { <vscale x 2 x float>, <vscale x 2 x float> } [[RET]]
+; CHECK-NEXT:    ret { <vscale x 2 x float>, <vscale x 2 x float> } poison
 ;
   %ret = call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.sincos.nxv2f32(<vscale x 2 x float> poison)
   ret { <vscale x 2 x float>, <vscale x 2 x float> } %ret
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 09e2c53465cd7..6c81d9a4d2ed6 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -10,8 +10,8 @@ define void @deinterleave_i8_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_i8_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0
+;
NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_i8_factor2 @@ -33,8 +33,8 @@ define void @deinterleave_i16_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_i16_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_i16_factor2 @@ -56,8 +56,8 @@ define void @deinterleave_8xi32_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_8xi32_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_8xi32_factor2 @@ -79,8 +79,8 @@ define void @deinterleave_i64_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_i64_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_i64_factor2 @@ -102,8 +102,8 @@ define void @deinterleave_float_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_float_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_float_factor2 @@ -125,8 +125,8 @@ define void @deinterleave_double_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_double_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_double_factor2 @@ -148,8 +148,8 @@ define void 
@deinterleave_ptr_factor2(ptr %ptr) {
 ; NEON-LABEL: define void @deinterleave_ptr_factor2
 ; NEON-SAME: (ptr [[PTR:%.*]]) {
 ; NEON-NEXT:    [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]])
-; NEON-NEXT:    [[TMP1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0
-; NEON-NEXT:    [[TMP2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_ptr_factor2
@@ -301,6 +301,10 @@ define void @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
 ; NEON-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8)
 ; NEON-NEXT:    [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1
 ; NEON-NEXT:    [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8)
+; NEON-NEXT:    [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0
+; NEON-NEXT:    [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1
+; NEON-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[TMP12]], 0
+; NEON-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[TMP12]], 1
 ; NEON-NEXT:    ret void
 ;
 ; SVE-FIXED-LABEL: define void @deinterleave_wide_i16_factor2
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 436389ba5b991..d7649801ea2fc 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -8,8 +8,8 @@ define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxi8_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 32 x i8>, ptr %ptr, align 1
@@ -23,8 +23,8 @@ define void @deinterleave_nxi16_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxi16_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 16 x i16>, ptr %ptr, align 2
@@ -38,8 +38,8 @@ define void @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nx8xi32_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 8 x i32>, ptr %ptr, align 4
@@ -53,8 +53,8 @@ define void @deinterleave_nxi64_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxi64_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x i64>, ptr %ptr, align 8
@@ -68,8 +68,8 @@ define void @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxfloat_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 8 x float>, ptr %ptr, align 4
@@ -83,8 +83,8 @@ define void @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxdouble_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x double>, ptr %ptr, align 8
@@ -98,8 +98,8 @@ define void @deinterleave_nxptr_factor2(ptr %ptr) #0 {
 ; CHECK-LABEL: define void @deinterleave_nxptr_factor2
 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.aarch64.sve.ld2.sret.nxv2p0(<vscale x 2 x i1> splat (i1 true), ptr [[PTR]])
-; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 4 x ptr>, ptr %ptr, align 8
@@ -215,6 +215,10 @@ define void @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP17]], i64 12)
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP15]], <vscale x 4 x i32> [[TMP19]], i64 12)
+; CHECK-NEXT:    [[TMP21:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } poison, <vscale x 16 x i32> [[TMP18]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP21]], <vscale x 16 x i32> [[TMP20]], 1
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]], 0
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]], 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <vscale x 32 x i32>, ptr %ptr, align 4
@@ -239,6 +243,10 @@ define void @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
+; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
+;
CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , } [[TMP11]], [[TMP10]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[TMP12]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[TMP12]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 8 diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index c565066541d1d..58c0bccc3be38 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -49,8 +49,16 @@ define void @wide_deinterleave4(ptr %src) { ; CHECK-NEXT: [[TMP16:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP7]], [[TMP15]], i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN1]], 3 ; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP17]], i64 4) -; CHECK-NEXT: [[SUM:%.*]] = add [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[SUB:%.*]] = sub [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { , , , } poison, [[TMP12]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { , , , } [[TMP19]], [[TMP14]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { , , , } [[TMP20]], [[TMP16]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { , , , } [[TMP21]], [[TMP18]], 3 +; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { , , , } [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { , , , } [[TMP22]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { , , , } [[TMP22]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { , , , } [[TMP22]], 3 +; CHECK-NEXT: [[SUM:%.*]] = add [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[SUB:%.*]] = sub [[TMP25]], [[TMP26]] ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 @@ -73,8 +81,8 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) { ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[LDN]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[LDN]], 3 ; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat (i1 true), ptr [[SRC]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[LDN1]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[LDN1]], 1 +; CHECK-NEXT: [[LD2_1:%.*]] = extractvalue { , } [[LDN1]], 0 +; CHECK-NEXT: [[LD2_2:%.*]] = extractvalue { , } [[LDN1]], 1 ; CHECK-NEXT: ret void ; @@ -95,12 +103,11 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) { define void @negative_deinterleave4_test(ptr %src) { ; CHECK-LABEL: define void @negative_deinterleave4_test ; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[SRC]], align 4 -; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[LOAD]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[SRC]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[LDN]], 2 ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll 
b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll index b109448bd5d7c..1418ca09c0d61 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll @@ -606,6 +606,10 @@ define void @deinterleave_nxptr_factor2(ptr %ptr) #2 { ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP7]], i64 2) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN2]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP5]], [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , } poison, [[TMP8]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , } [[TMP11]], [[TMP10]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[TMP12]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[TMP12]], 1 ; CHECK-NEXT: ret void ; %wide.vec = load , ptr %ptr, align 8 diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/addrspace.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/addrspace.ll index 8d156cfd4e526..f8e4ce9548147 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/addrspace.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/addrspace.ll @@ -20,7 +20,7 @@ define void @load_factor2(ptr addrspace(1) %ptr) { define void @load_factor2_vscale(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define void @load_factor2_vscale( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t.p1.i64(target("riscv.vector.tuple", , 2) poison, ptr addrspace(1) [[PTR]], i64 -1, i64 5) +; CHECK-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.p1.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr addrspace(1) [[PTR]], splat (i1 true), i64 -1, i64 3, i64 5) ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", , 2) [[TMP1]], i32 0) ; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", , 2) [[TMP1]], i32 1) diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index 72c1f22032bb7..672e94962da6d 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -25,7 +25,7 @@ define void @load_factor2(ptr %ptr) { define void @load_factor2_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor2_vscale( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t.p0.i32(target("riscv.vector.tuple", , 2) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv8i1.i32(target("riscv.vector.tuple", , 2) poison, ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", , 2) [[TMP1]], i32 0) ; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 ; RV32-NEXT: [[TMP4:%.*]] = call 
@llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", , 2) [[TMP1]], i32 1) @@ -35,7 +35,7 @@ define void @load_factor2_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor2_vscale( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t.p0.i64(target("riscv.vector.tuple", , 2) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.p0.nxv8i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", , 2) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , } poison, [[TMP2]], 0 ; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", , 2) [[TMP1]], i32 1) @@ -75,7 +75,7 @@ define void @load_factor3(ptr %ptr) { define void @load_factor3_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor3_vscale( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t.p0.i32(target("riscv.vector.tuple", , 3) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv2i1.i32(target("riscv.vector.tuple", , 3) poison, ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 0) ; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , } poison, [[TMP2]], 0 ; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 1) @@ -88,7 +88,7 @@ define void @load_factor3_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor3_vscale( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t.p0.i64(target("riscv.vector.tuple", , 3) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv2i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , } poison, [[TMP2]], 0 ; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 1) @@ -135,7 +135,7 @@ define void @load_factor4(ptr %ptr) { define void @load_factor4_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor4_vscale( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i32(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv4i1.i32(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call 
@llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) ; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 ; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) @@ -151,7 +151,7 @@ define void @load_factor4_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor4_vscale( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.p0.nxv4i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 ; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) @@ -205,7 +205,7 @@ define void @load_factor5(ptr %ptr) { define void @load_factor5_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor5_vscale( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t.p0.i32(target("riscv.vector.tuple", , 5) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv2i1.i32(target("riscv.vector.tuple", , 5) poison, ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 0) ; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , } poison, [[TMP2]], 0 ; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 1) @@ -224,7 +224,7 @@ define void @load_factor5_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor5_vscale( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t.p0.i64(target("riscv.vector.tuple", , 5) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv2i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , } poison, [[TMP2]], 0 ; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 1) @@ -285,7 +285,7 @@ define void @load_factor6(ptr %ptr) { define void @load_factor6_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor6_vscale( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t.p0.i32(target("riscv.vector.tuple", , 6) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv2i1.i32(target("riscv.vector.tuple", , 6) poison, ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 0) ; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , } poison, [[TMP2]], 0 ; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 1) @@ -307,7 +307,7 @@ define void @load_factor6_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor6_vscale( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t.p0.i64(target("riscv.vector.tuple", , 6) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv2i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , } poison, [[TMP2]], 0 ; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", , 6) [[TMP1]], i32 1) @@ -375,7 +375,7 @@ define void @load_factor7(ptr %ptr) { define void @load_factor7_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor7_vscale( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t.p0.i32(target("riscv.vector.tuple", , 7) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv2i1.i32(target("riscv.vector.tuple", , 7) poison, ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 0) ; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , } poison, [[TMP2]], 0 ; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 1) @@ -400,7 +400,7 @@ define void @load_factor7_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor7_vscale( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t.p0.i64(target("riscv.vector.tuple", , 7) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv2i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , } poison, [[TMP2]], 0 ; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 1) @@ -475,7 +475,7 @@ define void @load_factor8(ptr %ptr) { define void @load_factor8_vscale(ptr %ptr) { ; RV32-LABEL: @load_factor8_vscale( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv2i1.i32(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3, i32 5) ; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) ; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 ; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) @@ -503,7 +503,7 @@ define void @load_factor8_vscale(ptr %ptr) { ; RV32-NEXT: ret void ; ; RV64-LABEL: @load_factor8_vscale( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv2i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3, i64 5) ; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) ; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 ; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) @@ -566,13 +566,13 @@ define void @store_factor2_vscale(ptr %ptr, %v0, , 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_2t.nxv8i8(target("riscv.vector.tuple", , 2) poison, [[V0:%.*]], i32 0) ; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_2t.nxv8i8(target("riscv.vector.tuple", , 2) [[TMP1]], [[V1:%.*]], i32 1) -; RV32-NEXT: call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv8i8_2t.p0.i32(target("riscv.vector.tuple", , 2) [[TMP2]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: call void @llvm.riscv.vsseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv8i1.i32(target("riscv.vector.tuple", , 2) [[TMP2]], ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor2_vscale( ; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_2t.nxv8i8(target("riscv.vector.tuple", , 2) poison, [[V0:%.*]], i32 0) ; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_2t.nxv8i8(target("riscv.vector.tuple", , 2) [[TMP1]], [[V1:%.*]], i32 1) -; RV64-NEXT: call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv8i8_2t.p0.i64(target("riscv.vector.tuple", , 2) [[TMP2]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: call void @llvm.riscv.vsseg2.mask.triscv.vector.tuple_nxv8i8_2t.p0.nxv8i1.i64(target("riscv.vector.tuple", , 2) [[TMP2]], ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) @@ -611,14 +611,14 @@ define void @store_factor3_vscale(ptr %ptr, %v0, , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) poison, [[V0:%.*]], i32 0) ; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 3) 
@llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) [[TMP1]], [[V1:%.*]], i32 1) ; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) [[TMP2]], [[V2:%.*]], i32 2) -; RV32-NEXT: call void @llvm.riscv.vsseg3.triscv.vector.tuple_nxv8i8_3t.p0.i32(target("riscv.vector.tuple", , 3) [[TMP3]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: call void @llvm.riscv.vsseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv8i1.i32(target("riscv.vector.tuple", , 3) [[TMP3]], ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor3_vscale( ; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) poison, [[V0:%.*]], i32 0) ; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) [[TMP1]], [[V1:%.*]], i32 1) ; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) [[TMP2]], [[V2:%.*]], i32 2) -; RV64-NEXT: call void @llvm.riscv.vsseg3.triscv.vector.tuple_nxv8i8_3t.p0.i64(target("riscv.vector.tuple", , 3) [[TMP3]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: call void @llvm.riscv.vsseg3.mask.triscv.vector.tuple_nxv8i8_3t.p0.nxv8i1.i64(target("riscv.vector.tuple", , 3) [[TMP3]], ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave3.nxv8i8( %v0, %v1, %v2) @@ -660,7 +660,7 @@ define void @store_factor4_vscale(ptr %ptr, %v0, , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V1:%.*]], i32 1) ; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V2:%.*]], i32 2) ; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V3:%.*]], i32 3) -; RV32-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i32(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv8i1.i32(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor4_vscale( @@ -668,7 +668,7 @@ define void @store_factor4_vscale(ptr %ptr, %v0, , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V1:%.*]], i32 1) ; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V2:%.*]], i32 2) ; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V3:%.*]], i32 3) -; RV64-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: call void 
@llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv8i8_4t.p0.nxv8i1.i64(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave4.nxv8i8( %v0, %v1, %v2, %v3) @@ -683,7 +683,7 @@ define void @store_factor5_vscale(ptr %ptr, %v0, , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP2]], [[V2:%.*]], i32 2) ; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP3]], [[V3:%.*]], i32 3) ; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP4]], [[V4:%.*]], i32 4) -; RV32-NEXT: call void @llvm.riscv.vsseg5.triscv.vector.tuple_nxv8i8_5t.p0.i32(target("riscv.vector.tuple", , 5) [[TMP5]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: call void @llvm.riscv.vsseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv8i1.i32(target("riscv.vector.tuple", , 5) [[TMP5]], ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor5_vscale( @@ -692,7 +692,7 @@ define void @store_factor5_vscale(ptr %ptr, %v0, , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP2]], [[V2:%.*]], i32 2) ; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP3]], [[V3:%.*]], i32 3) ; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP4]], [[V4:%.*]], i32 4) -; RV64-NEXT: call void @llvm.riscv.vsseg5.triscv.vector.tuple_nxv8i8_5t.p0.i64(target("riscv.vector.tuple", , 5) [[TMP5]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: call void @llvm.riscv.vsseg5.mask.triscv.vector.tuple_nxv8i8_5t.p0.nxv8i1.i64(target("riscv.vector.tuple", , 5) [[TMP5]], ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave5.nxv8i8( %v0, %v1, %v2, %v3, %v4) @@ -780,7 +780,7 @@ define void @store_factor6_vscale(ptr %ptr, %v0, , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP3]], [[V3:%.*]], i32 3) ; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP4]], [[V4:%.*]], i32 4) ; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP5]], [[V5:%.*]], i32 5) -; RV32-NEXT: call void @llvm.riscv.vsseg6.triscv.vector.tuple_nxv8i8_6t.p0.i32(target("riscv.vector.tuple", , 6) [[TMP6]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: call void @llvm.riscv.vsseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv8i1.i32(target("riscv.vector.tuple", , 6) [[TMP6]], ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor6_vscale( @@ -790,7 +790,7 @@ define void @store_factor6_vscale(ptr %ptr, %v0, , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP3]], [[V3:%.*]], i32 3) ; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 6) 
@llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP4]], [[V4:%.*]], i32 4) ; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 6) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) [[TMP5]], [[V5:%.*]], i32 5) -; RV64-NEXT: call void @llvm.riscv.vsseg6.triscv.vector.tuple_nxv8i8_6t.p0.i64(target("riscv.vector.tuple", , 6) [[TMP6]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: call void @llvm.riscv.vsseg6.mask.triscv.vector.tuple_nxv8i8_6t.p0.nxv8i1.i64(target("riscv.vector.tuple", , 6) [[TMP6]], ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave6.nxv8i8( %v0, %v1, %v2, %v3, %v4, %v5) @@ -807,7 +807,7 @@ define void @store_factor7_vscale(ptr %ptr, %v0, , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP4]], [[V4:%.*]], i32 4) ; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP5]], [[V5:%.*]], i32 5) ; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP6]], [[V6:%.*]], i32 6) -; RV32-NEXT: call void @llvm.riscv.vsseg7.triscv.vector.tuple_nxv8i8_7t.p0.i32(target("riscv.vector.tuple", , 7) [[TMP7]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: call void @llvm.riscv.vsseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv8i1.i32(target("riscv.vector.tuple", , 7) [[TMP7]], ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor7_vscale( @@ -818,7 +818,7 @@ define void @store_factor7_vscale(ptr %ptr, %v0, , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP4]], [[V4:%.*]], i32 4) ; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP5]], [[V5:%.*]], i32 5) ; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP6]], [[V6:%.*]], i32 6) -; RV64-NEXT: call void @llvm.riscv.vsseg7.triscv.vector.tuple_nxv8i8_7t.p0.i64(target("riscv.vector.tuple", , 7) [[TMP7]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: call void @llvm.riscv.vsseg7.mask.triscv.vector.tuple_nxv8i8_7t.p0.nxv8i1.i64(target("riscv.vector.tuple", , 7) [[TMP7]], ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave7.nxv8i8( %v0, %v1, %v2, %v3, %v4, %v5, %v6) @@ -836,7 +836,7 @@ define void @store_factor8_vscale(ptr %ptr, %v0, , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V5:%.*]], i32 5) ; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V6:%.*]], i32 6) ; RV32-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V7:%.*]], i32 7) -; RV32-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: call 
void @llvm.riscv.vsseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv8i1.i32(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], splat (i1 true), i32 -1, i32 3) ; RV32-NEXT: ret void ; ; RV64-LABEL: @store_factor8_vscale( @@ -848,7 +848,7 @@ define void @store_factor8_vscale(ptr %ptr, %v0, , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V5:%.*]], i32 5) ; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V6:%.*]], i32 6) ; RV64-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V7:%.*]], i32 7) -; RV64-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: call void @llvm.riscv.vsseg8.mask.triscv.vector.tuple_nxv8i8_8t.p0.nxv8i1.i64(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], splat (i1 true), i64 -1, i64 3) ; RV64-NEXT: ret void ; %interleaved.vec = call @llvm.vector.interleave8.nxv8i8( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) diff --git a/llvm/test/Transforms/LoopInterchange/fp-reductions.ll b/llvm/test/Transforms/LoopInterchange/fp-reductions.ll new file mode 100644 index 0000000000000..0703a7b27979a --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/fp-reductions.ll @@ -0,0 +1,437 @@ +; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-output=%t -disable-output \ +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa +; RUN: FileCheck -input-file=%t %s + +; Check that the loops aren't exchanged if there is a reduction of +; non-reassociative floating-point addition. +; +; float sum = 0; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; sum += A[j][i]; + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: UnsupportedPHIOuter +; CHECK-NEXT: Function: reduction_fadd +define void @reduction_fadd(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %sum.j.next = fadd float %sum.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the interchange is legal if the floating-point addition is marked +; as reassoc. 
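+;
+; For intuition, an interchanged C-level sketch of the kernel above
+; (illustrative only): the transformed nest accumulates the same values in a
+; different order, which is value-preserving only when the fadd may be
+; reassociated:
+;
+; float sum = 0;
+; for (int j = 0; j < 2; j++)
+;   for (int i = 0; i < 2; i++)
+;     sum += A[j][i];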
+; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_fadd +define void @reduction_reassoc_fadd(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %sum.j.next = fadd reassoc float %sum.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; FIXME: Is it really legal to interchange the loops when +; both reassoc and ninf are set? +; Check that the interchange is legal if the floating-point addition is marked +; as reassoc. +; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_ninf_fadd +define void @reduction_reassoc_ninf_fadd(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %sum.j.next = fadd reassoc ninf float %sum.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the loops aren't exchanged if there is a reduction of +; non-reassociative floating-point multiplication. +; +; float prod = 1; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; prod *= A[j][i]; + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: UnsupportedPHIOuter +; CHECK-NEXT: Function: reduction_fmul +define void @reduction_fmul(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %prod.j.next = fmul float %prod.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %prod.i.lcssa = phi float [ %prod.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the interchange is legal if the floating-point multiplication is +; marked as reassoc. 
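+;
+; Interchanged C-level sketch (illustrative): the product is formed in a
+; different order after interchange, so the fmul must be reassociable:
+;
+; float prod = 1;
+; for (int j = 0; j < 2; j++)
+;   for (int i = 0; i < 2; i++)
+;     prod *= A[j][i];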
+; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_fmul +define void @reduction_reassoc_fmul(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %prod.j.next = fmul reassoc float %prod.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %prod.i.lcssa = phi float [ %prod.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the loops aren't exchanged if there is a reduction of +; non-reassociative floating-point fmuladd. +; +; float fmuladd = 0; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; fmuladd += A[j][i] * B[j][i]; + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: UnsupportedPHIOuter +; CHECK-NEXT: Function: reduction_fmuladd +define void @reduction_fmuladd(ptr %A, ptr %B) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ] + %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx.a, align 4 + %b = load float, ptr %idx.b, align 4 + %fmuladd.j.next = call float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j) + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the interchange is legal if the floating-point fmuladd is marked +; as reassoc. 
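+;
+; Interchanged C-level sketch (illustrative): the fused multiply-adds are
+; reordered by the interchange, which the reassoc flag on the call permits:
+;
+; float fmuladd = 0;
+; for (int j = 0; j < 2; j++)
+;   for (int i = 0; i < 2; i++)
+;     fmuladd += A[j][i] * B[j][i];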
+; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_fmuladd +define void @reduction_reassoc_fmuladd(ptr %A, ptr %B) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ] + %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx.a, align 4 + %b = load float, ptr %idx.b, align 4 + %fmuladd.j.next = call reassoc float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j) + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that interchanging the loops is legal for the reassociative +; floating-point minimum. +; +; float fmin = init; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; fmin = (A[j][i] < fmin) ? A[j][i] : fmin; + +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_fmin +define void @reduction_fmin(ptr %A, float %init) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %cmp = fcmp nnan nsz olt float %a, %fmin.j + %fmin.j.next = select nnan nsz i1 %cmp, float %a, float %fmin.j + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + + +; Check that interchanging the loops is legal for the floating-point +; llvm.minimumnum. 
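+;
+; C-level sketch (illustrative; fminimum_num is the C23 counterpart of
+; llvm.minimumnum). A min-reduction is insensitive to evaluation order, so
+; no fast-math flags are needed for the interchange:
+;
+; float fmin = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     fmin = fminimum_num(A[j][i], fmin);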
+;
+; CHECK: --- !Pass
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: reduction_fminimumnum
+define void @reduction_fminimumnum(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %fmin.j.next = call float @llvm.minimumnum.f32(float %a, float %fmin.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that interchanging the loops is legal for the reassociative
+; floating-point maximum.
+;
+; float fmax = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     fmax = (A[j][i] > fmax) ? A[j][i] : fmax;
+
+; CHECK: --- !Pass
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: reduction_fmax
+define void @reduction_fmax(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %cmp = fcmp nnan nsz ogt float %a, %fmax.j
+  %fmax.j.next = select nnan nsz i1 %cmp, float %a, float %fmax.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+; Check that interchanging the loops is legal for the floating-point
+; llvm.maximumnum.
+
+; CHECK: --- !Pass
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: reduction_fmaximumnum
+define void @reduction_fmaximumnum(ptr %A, float %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load float, ptr %idx, align 4
+  %fmax.j.next = call float @llvm.maximumnum.f32(float %a, float %fmax.j)
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
+declare float @llvm.minimumnum.f32(float %a, float %b)
+declare float @llvm.maximumnum.f32(float %a, float %b)
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
new file mode 100644
index 0000000000000..f5c6ad7889366
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll
@@ -0,0 +1,335 @@
+; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-output=%t -disable-output \
+; RUN:   -verify-dom-info -verify-loop-info -verify-loop-lcssa
+; RUN: FileCheck -input-file=%t %s
+
+; Check that interchanging the loops is legal for the bitwise-or reduction.
+;
+; int b_or = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     b_or |= A[j][i];
+
+; CHECK: --- !Pass
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: reduction_or
+define void @reduction_or(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %or.i = phi i32 [ 0, %entry ], [ %or.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %or.j = phi i32 [ %or.i, %for.i.header ], [ %or.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %or.j.next = or i32 %or.j, %a
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %or.i.lcssa = phi i32 [ %or.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; Check that interchanging the loops is legal for the bitwise-and reduction.
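+; Bitwise AND is associative and commutative, so the partial results may be
+; combined in any order; the -1 (all bits set) below is its identity value.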
+; +; int b_and = -1; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; b_and &= A[j][i]; + +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_and +define void @reduction_and(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %and.i = phi i32 [ -1, %entry ], [ %and.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %and.j = phi i32 [ %and.i, %for.i.header ], [ %and.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load i32, ptr %idx, align 4 + %and.j.next = and i32 %and.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %and.i.lcssa = phi i32 [ %and.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + + +; Check that interchanging the loops is legal for the bitwise-xor reduction. +; +; int b_xor = 0; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; b_xor ^= A[j][i]; + +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_xor +define void @reduction_xor(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %xor.i = phi i32 [ 0, %entry ], [ %xor.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %xor.j = phi i32 [ %xor.i, %for.i.header ], [ %xor.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load i32, ptr %idx, align 4 + %xor.j.next = xor i32 %xor.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %xor.i.lcssa = phi i32 [ %xor.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + + +; Check that interchanging the loops is legal for the signed-minimum reduction. +; +; int smin = init; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; smin = (A[j][i] < smin) ? A[j][i] : smin; + +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_smin +define void @reduction_smin(ptr %A, i32 %init) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %smin.i = phi i32 [ %init, %entry ], [ %smin.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %smin.j = phi i32 [ %smin.i, %for.i.header ], [ %smin.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load i32, ptr %idx, align 4 + %cmp = icmp slt i32 %a, %smin.j + %smin.j.next = select i1 %cmp, i32 %a, i32 %smin.j + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %smin.i.lcssa = phi i32 [ %smin.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + + +; Check that interchanging the loops is legal for the signed-maximum reduction. 
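+; As in reduction_smin above, the icmp/select pair below is the usual form of
+; an integer max recurrence; integer min/max are associative and commutative,
+; so reordering the loop nest does not change the result.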
+;
+; int smax = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     smax = (A[j][i] > smax) ? A[j][i] : smax;
+
+; CHECK: --- !Pass
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: reduction_smax
+define void @reduction_smax(ptr %A, i32 %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %smax.i = phi i32 [ %init, %entry ], [ %smax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %smax.j = phi i32 [ %smax.i, %for.i.header ], [ %smax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp sgt i32 %a, %smax.j
+  %smax.j.next = select i1 %cmp, i32 %a, i32 %smax.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %smax.i.lcssa = phi i32 [ %smax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; Check that interchanging the loops is legal for the unsigned-minimum reduction.
+;
+; unsigned umin = init;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     umin = (A[j][i] < umin) ? A[j][i] : umin;
+
+; CHECK: --- !Pass
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: reduction_umin
+define void @reduction_umin(ptr %A, i32 %init) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %umin.i = phi i32 [ %init, %entry ], [ %umin.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %umin.j = phi i32 [ %umin.i, %for.i.header ], [ %umin.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp ult i32 %a, %umin.j
+  %umin.j.next = select i1 %cmp, i32 %a, i32 %umin.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %umin.i.lcssa = phi i32 [ %umin.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; Check that interchanging the loops is legal for the unsigned-maximum reduction.
+;
+; unsigned umax = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     umax = (A[j][i] > umax) ? A[j][i] : umax;
+
+; CHECK: --- !Pass
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: reduction_umax
+define void @reduction_umax(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %umax.i = phi i32 [ 0, %entry ], [ %umax.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %umax.j = phi i32 [ %umax.i, %for.i.header ], [ %umax.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp ugt i32 %a, %umax.j
+  %umax.j.next = select i1 %cmp, i32 %a, i32 %umax.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %umax.i.lcssa = phi i32 [ %umax.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
+
+
+; Check that interchanging the loops is legal for the any-of reduction.
+;
+; int any_of = 0;
+; for (int i = 0; i < 2; i++)
+;   for (int j = 0; j < 2; j++)
+;     any_of = (A[j][i] == 42) ? 1 : any_of;
+
+; CHECK: --- !Pass
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: reduction_anyof
+define void @reduction_anyof(ptr %A) {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ]
+  %anyof.i = phi i32 [ 0, %entry ], [ %anyof.i.lcssa, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ]
+  %anyof.j = phi i32 [ %anyof.i, %for.i.header ], [ %anyof.j.next, %for.j ]
+  %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i
+  %a = load i32, ptr %idx, align 4
+  %cmp = icmp eq i32 %a, 42
+  %anyof.j.next = select i1 %cmp, i32 1, i32 %anyof.j
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, 2
+  br i1 %cmp.j, label %for.j, label %for.i.latch
+
+for.i.latch:
+  %anyof.i.lcssa = phi i32 [ %anyof.j.next, %for.j ]
+  %i.inc = add i32 %i, 1
+  %cmp.i = icmp slt i32 %i.inc, 2
+  br i1 %cmp.i, label %for.i.header, label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopInterchange/reductions-with-nowraps.ll b/llvm/test/Transforms/LoopInterchange/reductions-with-nowraps.ll
new file mode 100644
index 0000000000000..5c05f963a2f3e
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/reductions-with-nowraps.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-interchange -cache-line-size=64 -S < %s | FileCheck %s
+
+; Check that nsw/nuw flags are dropped when interchanging loops.
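+;
+; Interchange reorders the additions of the reduction, and the nuw/nsw flags
+; were only known to hold for the original evaluation order. A sketch with
+; hypothetical values of why keeping them would be unsound:
+;
+;   (INT_MAX + -1) + 1   // == INT_MAX, no intermediate signed wrap
+;   (INT_MAX + 1) + -1   // the first add already wraps
+;
+; so an add that was nsw before interchange may not be nsw afterwards.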
+; +; int sum = 0; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; sum += A[j][i]; +; +define void @reduction_add(ptr %A) { +; CHECK-LABEL: define void @reduction_add( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_J_PREHEADER:.*]] +; CHECK: [[FOR_I_HEADER_PREHEADER:.*]]: +; CHECK-NEXT: br label %[[FOR_I_HEADER:.*]] +; CHECK: [[FOR_I_HEADER]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INC:%.*]], %[[FOR_I_LATCH:.*]] ], [ 0, %[[FOR_I_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[SUM_J:%.*]] = phi i32 [ [[SUM_J_NEXT:%.*]], %[[FOR_I_LATCH]] ], [ [[SUM_I:%.*]], %[[FOR_I_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_J_SPLIT1:.*]] +; CHECK: [[FOR_J_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_J:.*]] +; CHECK: [[FOR_J]]: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[TMP0:%.*]], %[[FOR_J_SPLIT:.*]] ], [ 0, %[[FOR_J_PREHEADER]] ] +; CHECK-NEXT: [[SUM_I]] = phi i32 [ [[SUM_I_LCSSA:%.*]], %[[FOR_J_SPLIT]] ], [ 0, %[[FOR_J_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_I_HEADER_PREHEADER]] +; CHECK: [[FOR_J_SPLIT1]]: +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr [[A]], i32 0, i32 [[J]], i32 [[I]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[IDX]], align 4 +; CHECK-NEXT: [[SUM_J_NEXT]] = add i32 [[SUM_J]], [[A]] +; CHECK-NEXT: [[J_INC:%.*]] = add i32 [[J]], 1 +; CHECK-NEXT: [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], 2 +; CHECK-NEXT: br label %[[FOR_I_LATCH]] +; CHECK: [[FOR_J_SPLIT]]: +; CHECK-NEXT: [[SUM_I_LCSSA]] = phi i32 [ [[SUM_J_NEXT]], %[[FOR_I_LATCH]] ] +; CHECK-NEXT: [[TMP0]] = add i32 [[J]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_J]], label %[[EXIT:.*]] +; CHECK: [[FOR_I_LATCH]]: +; CHECK-NEXT: [[I_INC]] = add i32 [[I]], 1 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[I_INC]], 2 +; CHECK-NEXT: br i1 [[CMP_I]], label %[[FOR_I_HEADER]], label %[[FOR_J_SPLIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %sum.i = phi i32 [ 0, %entry ], [ %sum.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %sum.j = phi i32 [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load i32, ptr %idx, align 4 + %sum.j.next = add nuw nsw i32 %sum.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %sum.i.lcssa = phi i32 [ %sum.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that nsw/nuw flags are dropped when interchanging loops. 
+; +; int prod = 1; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; prod *= A[j][i]; +; +define void @reduction_mul(ptr %A) { +; CHECK-LABEL: define void @reduction_mul( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_J_PREHEADER:.*]] +; CHECK: [[FOR_I_HEADER_PREHEADER:.*]]: +; CHECK-NEXT: br label %[[FOR_I_HEADER:.*]] +; CHECK: [[FOR_I_HEADER]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INC:%.*]], %[[FOR_I_LATCH:.*]] ], [ 0, %[[FOR_I_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[PROD_J:%.*]] = phi i32 [ [[PROD_J_NEXT:%.*]], %[[FOR_I_LATCH]] ], [ [[PROD_I:%.*]], %[[FOR_I_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_J_SPLIT1:.*]] +; CHECK: [[FOR_J_PREHEADER]]: +; CHECK-NEXT: br label %[[FOR_J:.*]] +; CHECK: [[FOR_J]]: +; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[TMP0:%.*]], %[[FOR_J_SPLIT:.*]] ], [ 0, %[[FOR_J_PREHEADER]] ] +; CHECK-NEXT: [[PROD_I]] = phi i32 [ [[PROD_I_LCSSA:%.*]], %[[FOR_J_SPLIT]] ], [ 1, %[[FOR_J_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_I_HEADER_PREHEADER]] +; CHECK: [[FOR_J_SPLIT1]]: +; CHECK-NEXT: [[IDX:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr [[A]], i32 0, i32 [[J]], i32 [[I]] +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[IDX]], align 4 +; CHECK-NEXT: [[PROD_J_NEXT]] = mul i32 [[PROD_J]], [[A]] +; CHECK-NEXT: [[J_INC:%.*]] = add i32 [[J]], 1 +; CHECK-NEXT: [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], 2 +; CHECK-NEXT: br label %[[FOR_I_LATCH]] +; CHECK: [[FOR_J_SPLIT]]: +; CHECK-NEXT: [[PROD_I_LCSSA]] = phi i32 [ [[PROD_J_NEXT]], %[[FOR_I_LATCH]] ] +; CHECK-NEXT: [[TMP0]] = add i32 [[J]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_J]], label %[[EXIT:.*]] +; CHECK: [[FOR_I_LATCH]]: +; CHECK-NEXT: [[I_INC]] = add i32 [[I]], 1 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[I_INC]], 2 +; CHECK-NEXT: br i1 [[CMP_I]], label %[[FOR_I_HEADER]], label %[[FOR_J_SPLIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %prod.i = phi i32 [ 1, %entry ], [ %prod.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %prod.j = phi i32 [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load i32, ptr %idx, align 4 + %prod.j.next = mul nsw nuw i32 %prod.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %prod.i.lcssa = phi i32 [ %prod.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll index 1a091e847ca34..0b78beea54aa9 100644 --- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll @@ -578,8 +578,323 @@ loop.latch: exit: ret void } + +define i32 @test_add_reduction_unroll_partial(ptr %a, i64 noundef %n) { +; APPLE-LABEL: define i32 @test_add_reduction_unroll_partial( +; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] 
], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ] +; APPLE-NEXT: ret i32 [[BIN_RDX2]] +; +; OTHER-LABEL: define i32 @test_add_reduction_unroll_partial( +; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; OTHER-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; OTHER-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; OTHER-NEXT: [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP1]] +; OTHER-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; OTHER-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; OTHER-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; OTHER-NEXT: [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP2]] +; OTHER-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; OTHER-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; OTHER-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; OTHER-NEXT: [[RDX_NEXT_3]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP3]] +; OTHER-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; OTHER-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024 +; OTHER-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: ret i32 [[BIN_RDX2]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1024 + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} + +declare i1 @cond() + +define i32 @test_add_reduction_multi_block(ptr %a, i64 noundef %n) { +; APPLE-LABEL: define i32 @test_add_reduction_multi_block( +; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP_LATCH]] ] +; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; APPLE-NEXT: [[C:%.*]] = call i1 @cond() +; APPLE-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] 
+; APPLE: [[THEN]]: +; APPLE-NEXT: store i32 0, ptr [[GEP_A]], align 4 +; APPLE-NEXT: br label %[[LOOP_LATCH]] +; APPLE: [[LOOP_LATCH]]: +; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP_LATCH]] ] +; APPLE-NEXT: ret i32 [[RES]] +; +; OTHER-LABEL: define i32 @test_add_reduction_multi_block( +; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP_LATCH]] ] +; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; OTHER-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; OTHER-NEXT: [[C:%.*]] = call i1 @cond() +; OTHER-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; OTHER: [[THEN]]: +; OTHER-NEXT: store i32 0, ptr [[GEP_A]], align 4 +; OTHER-NEXT: br label %[[LOOP_LATCH]] +; OTHER: [[LOOP_LATCH]]: +; OTHER-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; OTHER-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; OTHER-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP_LATCH]] ] +; OTHER-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop.latch ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %c = call i1 @cond() + br i1 %c, label %then, label %loop.latch + +then: + store i32 0, ptr %gep.a + br label %loop.latch + +loop.latch: + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1024 + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ %rdx.next, %loop.latch ] + ret i32 %res +} + +define i32 @test_add_and_mul_reduction_unroll_partial(ptr %a, i64 noundef %n) { +; APPLE-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial( +; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; APPLE-NEXT: [[RDX_2_NEXT]] = mul i32 [[RDX_2]], [[TMP0]] +; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ] +; APPLE-NEXT: [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT]], %[[LOOP]] ] +; APPLE-NEXT: [[SUM:%.*]] = add i32 [[BIN_RDX3]], [[RES_2]] +; APPLE-NEXT: ret i32 [[SUM]] +; +; 
OTHER-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial( +; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; OTHER-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; OTHER-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[TMP0]] +; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; OTHER-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; OTHER-NEXT: [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP1]] +; OTHER-NEXT: [[RDX_2_NEXT_1]] = mul i32 [[RDX_2_NEXT]], [[TMP1]] +; OTHER-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; OTHER-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_NEXT_1]], 1024 +; OTHER-NEXT: br i1 [[EC_1]], label %[[EXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: [[BIN_RDX:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; OTHER-NEXT: [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT_1]], %[[LOOP]] ] +; OTHER-NEXT: [[SUM:%.*]] = add i32 [[BIN_RDX]], [[RES_2]] +; OTHER-NEXT: ret i32 [[SUM]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %rdx.2 = phi i32 [ 0, %entry ], [ %rdx.2.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %rdx.2.next = mul i32 %rdx.2, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1024 + br i1 %ec, label %exit, label %loop + +exit: + %res.1 = phi i32 [ %rdx.next, %loop ] + %res.2 = phi i32 [ %rdx.2.next, %loop ] + %sum = add i32 %res.1, %res.2 + ret i32 %sum +} + + +define i32 @test_add_reduction_runtime(ptr %a, i64 noundef %n) { +; APPLE-LABEL: define i32 @test_add_reduction_runtime( +; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]] +; APPLE-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 +; APPLE-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]] +; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1 +; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]] +; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP]] ] +; APPLE-NEXT: ret i32 [[RES]] +; +; OTHER-LABEL: define i32 @test_add_reduction_runtime( +; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; OTHER-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 3 +; OTHER-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3 +; OTHER-NEXT: br i1 
[[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; OTHER: [[ENTRY_NEW]]: +; OTHER-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; OTHER-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 +; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; OTHER-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; OTHER-NEXT: [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; OTHER-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; OTHER-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; OTHER-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; OTHER-NEXT: [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP4]] +; OTHER-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; OTHER-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; OTHER-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; OTHER-NEXT: [[RDX_NEXT_3]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP5]] +; OTHER-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; OTHER-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 +; OTHER-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] +; OTHER-NEXT: br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; OTHER-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: br label %[[EXIT_UNR_LCSSA]] +; OTHER: [[EXIT_UNR_LCSSA]]: +; OTHER-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; OTHER-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; OTHER: [[LOOP_EPIL_PREHEADER]]: +; OTHER-NEXT: br label %[[LOOP_EPIL:.*]] +; OTHER: [[LOOP_EPIL]]: +; OTHER-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; OTHER-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; OTHER-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ] +; OTHER-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]] +; OTHER-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 +; OTHER-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]] +; OTHER-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1 +; OTHER-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]] +; OTHER-NEXT: 
[[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 +; OTHER-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] +; OTHER-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; OTHER: [[EXIT_EPILOG_LCSSA]]: +; OTHER-NEXT: [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; OTHER-NEXT: br label %[[EXIT]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ] +; OTHER-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} ;. ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"} ; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} ;. +; OTHER: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; OTHER: [[META1]] = !{!"llvm.loop.unroll.disable"} +;. diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll new file mode 100644 index 0000000000000..953dc278b6644 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -0,0 +1,446 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-unroll -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s + +define i32 @test_add(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: ret 
i32 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.next = add i32 %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx.next +} + +define i32 @test_add_tc_not_multiple_of_4(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add_tc_not_multiple_of_4( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP_1:.*]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 1001 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_1]], label %[[EXIT:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_12:%.*]] = load i32, ptr [[GEP_SRC_12]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_12]] +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.next = add i32 %rdx, %l + %ec = icmp ne i64 %iv.next, 1001 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx.next +} + +define i32 @test_add_rdx_used_in_loop(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add_rdx_used_in_loop( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_24:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: store i32 [[RDX_NEXT]], ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 
[[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: store i32 [[RDX_NEXT_1]], ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: store i32 [[RDX_NEXT_2]], ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_24]] = add i32 [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: store i32 [[RDX_NEXT_24]], ptr [[GEP_SRC_24]], align 4 +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_24]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.next = add i32 %rdx, %l + store i32 %rdx.next, ptr %gep.src + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx.next +} + +define i32 @test_add_phi_used_outside_loop(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add_phi_used_outside_loop( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_2]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[RDX_LCSSA]] +; +entry: + br label %loop + +loop: 
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.next = add i32 %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx +} + +define i32 @test_add_and_mul_reduction(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add_and_mul_reduction( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_1_NEXT:%.*]] = add i32 [[RDX_1]], [[L]] +; CHECK-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_1_2:%.*]] = add i32 [[RDX_1_NEXT]], [[L_1]] +; CHECK-NEXT: [[RDX_2_2:%.*]] = mul i32 [[RDX_2_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_1_NEXT_2:%.*]] = add i32 [[RDX_1_2]], [[L_2]] +; CHECK-NEXT: [[RDX_2_NEXT_2:%.*]] = mul i32 [[RDX_2_2]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_1_NEXT_3]] = add i32 [[RDX_1_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_1_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX5:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[RDX_1_NEXT_LCSSA]], [[BIN_RDX5]] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx.1 = phi i32 [ %start, %entry ], [ %rdx.1.next, %loop ] + %rdx.2 = phi i32 [ %start, %entry ], [ %rdx.2.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.1.next = add i32 %rdx.1, %l + %rdx.2.next = mul i32 %rdx.2, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + %res = add i32 %rdx.1.next, %rdx.2.next + ret i32 %res +} + +define float @test_fadd_no_fmfs(ptr %src, i64 %n, float %start) { +; CHECK-LABEL: define float @test_fadd_no_fmfs( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], float [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, 
%[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT: [[RDX_NEXT:%.*]] = fadd float [[RDX]], [[L]]
+; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = fadd float [[RDX_NEXT]], [[L_1]]
+; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = fadd float [[RDX_NEXT_1]], [[L_2]]
+; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT: [[L_24:%.*]] = load float, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT: [[RDX_NEXT_3]] = fadd float [[RDX_NEXT_2]], [[L_24]]
+; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi float [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT: ret float [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi float [ %start, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 1
+  %rdx.next = fadd float %rdx, %l
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret float %rdx.next
+}
+
+define float @test_fadd_with_reassoc(ptr %src, i64 %n, float %start) {
+; CHECK-LABEL: define float @test_fadd_with_reassoc(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], float [[START:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT: [[RDX_NEXT:%.*]] = fadd reassoc float [[RDX]], [[L]]
+; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = fadd reassoc float [[RDX_NEXT]], [[L_1]]
+; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = fadd reassoc float [[RDX_NEXT_1]], [[L_2]]
+; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT: [[L_24:%.*]] = load float, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT: [[RDX_NEXT_3]] = fadd reassoc float [[RDX_NEXT_2]], [[L_24]]
+; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi float [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT: ret float [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %rdx = phi float [ %start, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 1
+  %rdx.next = fadd reassoc float %rdx, %l
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret float %rdx.next
+}
+
+define i32 @test_smin(ptr %src, i64 %n) {
+; CHECK-LABEL: define i32 @test_smin(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[MIN:%.*]] = phi i32 [ 1000, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT: [[RDX_NEXT:%.*]] = call i32 @llvm.smin.i32(i32 [[MIN]], i32 [[L]])
+; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT]], i32 [[L_1]])
+; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
+; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
+; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT_1]], i32 [[L_2]])
+; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
+; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
+; CHECK-NEXT: [[RDX_NEXT_3]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT_2]], i32 [[L_24]])
+; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
+; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
+; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %min = phi i32 [ 1000, %entry ], [ %rdx.next, %loop ]
+  %iv.next = add i64 %iv, 1
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src, align 1
+  %rdx.next = call i32 @llvm.smin(i32 %min, i32 %l)
+  %ec = icmp ne i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret i32 %rdx.next
+}
+
+define i64 @test_any_of_reduction(ptr %src, i64 %n) {
+; CHECK-LABEL: define i64 @test_any_of_reduction(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ANY_OF_RDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]],
align 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = select i1 [[C]], i64 [[ANY_OF_RDX]], i64 0 +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L_1]], 0 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = select i1 [[C_1]], i64 [[RDX_NEXT]], i64 0 +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i8, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq i8 [[L_2]], 0 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = select i1 [[C_2]], i64 [[RDX_NEXT_1]], i64 0 +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i8, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[C_24:%.*]] = icmp eq i8 [[L_24]], 0 +; CHECK-NEXT: [[RDX_NEXT_3]] = select i1 [[C_24]], i64 [[RDX_NEXT_2]], i64 0 +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %any.of.rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i8, ptr %src, i64 %iv + %l = load i8, ptr %gep.src, align 1 + %c = icmp eq i8 %l, 0 + %rdx.next = select i1 %c, i64 %any.of.rdx, i64 0 + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i64 %rdx.next +} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll new file mode 100644 index 0000000000000..89f06ad373aa9 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-unroll -S %s | FileCheck %s + +define i32 @test_add_reduction(ptr %a, i64 %n) { +; CHECK-LABEL: define i32 @test_add_reduction( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, 
ptr [[GEP_A_1]], align 2 +; CHECK-NEXT: [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[EXIT_UNR_LCSSA]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_UNR]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 +; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add nuw nsw i32 [[RDX_UNR]], [[TMP4]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} + +define i32 @test_add_reduction_constant_op(ptr %a, i64 %n) { +; CHECK-LABEL: define i32 @test_add_reduction_constant_op( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX]], 2 +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[RES_PH_PH:%.*]] = phi 
i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[EXIT_UNR_LCSSA]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add nuw nsw i32 [[RDX_UNR]], 1 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %rdx.next = add nuw nsw i32 %rdx, 1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} + +define i32 @test_add_reduction_8x_unroll(ptr %a, i64 %n) { +; CHECK-LABEL: define i32 @test_add_reduction_8x_unroll( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7 +; CHECK-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_7:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; CHECK-NEXT: [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP4]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; CHECK-NEXT: [[RDX_4:%.*]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP5]] +; CHECK-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_A_4:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_3]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_4]], align 2 +; CHECK-NEXT: [[RDX_NEXT_4:%.*]] = add nuw nsw i32 [[RDX_4]], [[TMP6]] +; CHECK-NEXT: 
[[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5 +; CHECK-NEXT: [[GEP_A_5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[GEP_A_5]], align 2 +; CHECK-NEXT: [[RDX_6:%.*]] = add nuw nsw i32 [[RDX_NEXT_4]], [[TMP7]] +; CHECK-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6 +; CHECK-NEXT: [[GEP_A_6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[GEP_A_6]], align 2 +; CHECK-NEXT: [[RDX_NEXT_6:%.*]] = add nuw nsw i32 [[RDX_6]], [[TMP8]] +; CHECK-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7 +; CHECK-NEXT: [[GEP_A_7:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_6]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[GEP_A_7]], align 2 +; CHECK-NEXT: [[RDX_NEXT_7]] = add nuw nsw i32 [[RDX_NEXT_6]], [[TMP9]] +; CHECK-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8 +; CHECK-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8 +; CHECK-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_7]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_7]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[EXIT_UNR_LCSSA]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 +; CHECK-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP10]] +; CHECK-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1 +; CHECK-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]] +; CHECK-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT_EPILOG_LCSSA]]: +; CHECK-NEXT: [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, 
%loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, !llvm.loop !2 + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} + + + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.unroll.count", i32 2} + +!2 = distinct !{!2, !3} +!3 = !{!"llvm.loop.unroll.count", i32 8} + +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 5f72fa4b4e8e9..46a194dacad9e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -1484,7 +1484,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) { ; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 1) -; DEFAULT-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i32> +; DEFAULT-NEXT: [[TMP1:%.*]] = trunc nuw nsw <4 x i64> [[TMP0]] to <4 x i32> ; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 ; DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DST]], align 4 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -1521,7 +1521,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) { ; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] ; PRED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 20) ; PRED-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1) -; PRED-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32> +; PRED-NEXT: [[TMP2:%.*]] = trunc nuw nsw <4 x i64> [[TMP1]] to <4 x i32> ; PRED-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 ; PRED-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; PRED: [[PRED_STORE_IF]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll new file mode 100644 index 0000000000000..e93ee5563b057 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -p loop-vectorize -mtriple=arm64-apple-macosx -S %s | FileCheck %s + +define float @fmax_ogt_with_select(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmax_ogt_with_select( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[L]], [[MAX]] +; CHECK-NEXT: [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp olt float %l, %min + %min.next = select i1 %cmp, float %l, float %min + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fminnum(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fminnum( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %min.next = call float @llvm.minnum.f32(float %min, float %l) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 4af4929fad521..767bc356a363e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -114,8 +114,8 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -170,8 +170,8 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) { ; CHECK-NEXT: 
[[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -319,8 +319,8 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) { ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index 31be8862a8872..57d5b4331b7d9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -844,8 +844,8 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP5]], i32 14 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP5]], i32 15 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP5]], i32 14 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 0fc324f720e60..0c79086c80b5a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -504,8 +504,8 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: ; @@ -533,8 
+533,8 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: ; @@ -562,8 +562,8 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 887f95ce31f47..14725e0eb2096 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -868,11 +868,11 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP22]] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP22]] ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP26]] ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -922,11 +922,11 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]] ; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1 -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32>
[[TMP27]], i32 [[TMP35]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -970,11 +970,11 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP22]] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP22]] ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP24]], 8 ; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]] +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]] ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 5d5ee570da0ff..63eb97a570745 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -911,6 +911,262 @@ for.exit: ; preds = %for.body ret i32 %add } + +define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @add_of_zext_outside_loop( +; CHECK-INTERLEAVE1-SAME: i32 [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw
i32 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; +; CHECK-INTERLEAVED-LABEL: define i32 @add_of_zext_outside_loop( +; CHECK-INTERLEAVED-SAME: i32 [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sub i32 1024, [[D]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP2]], 32 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16 +; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add nuw i32 [[VEC_PHI1]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP21]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; +; CHECK-MAXBW-LABEL: define i32 @add_of_zext_outside_loop( +; 
CHECK-MAXBW-SAME: i32 [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32 +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw i32 [[TMP1]], 16 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 16 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP4]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP5]], 16 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i32 [[D]], [[N_VEC]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = insertelement <vscale x 16 x i32> zeroinitializer, i32 [[A]], i32 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; +entry: + %conv1 = zext i8 %c to i32 + br label %for.body + +for.body: + %iv = phi i32 [ %d, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %a, %entry ], [ %rdx.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %b, i32 %iv + store i8 0, ptr %arrayidx, align 1 + %rdx.next = add nsw i32 %rdx, %conv1 + %iv.next = add nuw nsw i32 %iv, 1 + %cmp = icmp eq i32 %iv.next, 1024 + br i1 %cmp, label %exit, label %for.body + +exit: + %add.lcssa = phi i32 [ %rdx.next, %for.body ] + ret i32 %add.lcssa +} + +define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @add_of_loop_invariant_zext( +; CHECK-INTERLEAVE1-SAME: i32 [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label
[[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; +; CHECK-INTERLEAVED-LABEL: define i32 @add_of_loop_invariant_zext( +; CHECK-INTERLEAVED-SAME: i32 [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sub i32 1024, [[D]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP2]], 32 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP12]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16 +; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI2]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; +; CHECK-MAXBW-LABEL: define i32 @add_of_loop_invariant_zext( +; CHECK-MAXBW-SAME: i32 [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw i32 [[TMP1]], 16 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 16 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP4]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP5]], 16 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i32 [[D]], [[N_VEC]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[A]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[BROADCAST_SPLAT]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP8]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]]) +; CHECK-MAXBW-NEXT:
[[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; +entry: + br label %for.body + +for.body: + %iv = phi i32 [ %d, %entry ], [ %iv.next, %for.body ] + %rdx = phi i32 [ %a, %entry ], [ %rdx.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %b, i32 %iv + store i8 0, ptr %arrayidx, align 1 + %conv1 = zext i8 %c to i32 + %rdx.next = add nsw i32 %rdx, %conv1 + %iv.next = add nuw nsw i32 %iv, 1 + %cmp = icmp eq i32 %iv.next, 1024 + br i1 %cmp, label %exit, label %for.body + +exit: + %add.lcssa = phi i32 [ %rdx.next, %for.body ] + ret i32 %add.lcssa +} + + + !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll index 4c29a3a0d1d01..0a751699a0549 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5 ; RUN: opt -mattr=+mve -passes=loop-vectorize < %s -S -o - | FileCheck %s target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" @@ -49,7 +49,7 @@ define void @fn(i32 noundef %n, ptr %in, ptr %out) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw <4 x i32> [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP11]], splat (i32 16) -; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP13:%.*]] = trunc nuw <4 x i32> [[TMP12]] to <4 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP3]], splat (i32 32767) ; CHECK-NEXT: [[TMP15:%.*]] = mul nuw <4 x i32> [[TMP5]], splat (i32 16762097) ; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <4 x i32> [[TMP7]], splat (i32 16759568) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll index e3f9540ff3df7..1f6d71ed16b46 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll @@ -36,10 +36,10 @@ define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP12]] +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1 -; CHECK-NEXT:
[[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP15]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -57,7 +57,7 @@ define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 { ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22 ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RES]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll index 0490d63f67d4e..e31d7ff4c1748 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -546,17 +546,17 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) { ; NO-VP: [[MIDDLE_BLOCK]]: ; NO-VP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() ; NO-VP-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4 -; NO-VP-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 2 +; NO-VP-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 ; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP19]] ; NO-VP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; NO-VP-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 4 -; NO-VP-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 +; NO-VP-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 2 ; NO-VP-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP22]] ; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; NO-VP: [[SCALAR_PH]]: ; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; NO-VP-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] ; NO-VP-NEXT: br label %[[FOR_BODY:.*]] ; NO-VP: [[FOR_BODY]]: ; NO-VP-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] @@ -570,7 +570,7 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) { ; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] ; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; NO-VP: [[FOR_END]]: -; NO-VP-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] +; NO-VP-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] ; NO-VP-NEXT: ret i32 [[FOR1_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 0a85548f8750b..d445e0d5ae7cf 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -416,8
+416,8 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll index 0c5784b32fc9f..03c334b148321 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll @@ -23,10 +23,10 @@ define i16 @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] @@ -46,7 +46,7 @@ define i16 @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]] ; CHECK-NEXT: ret i16 [[RES]] ; @@ -92,10 +92,10 @@ define i16 @test_chained_first_order_recurrences_2(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; 
CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] @@ -115,7 +115,7 @@ define i16 @test_chained_first_order_recurrences_2(ptr %ptr) { ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]] ; CHECK-NEXT: ret i16 [[RES]] ; @@ -164,12 +164,12 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] @@ -192,8 +192,8 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI5]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]] ; CHECK-NEXT: [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]] ; CHECK-NEXT: ret i16 [[RES_2]] @@ -349,12 +349,12 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] @@ -377,8 +377,8 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI5]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]] ; CHECK-NEXT: [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]] ; CHECK-NEXT: ret i16 [[RES_2]] @@ -431,12 +431,12 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] @@ -459,8 
+459,8 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) { ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI5]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]] ; CHECK-NEXT: [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]] ; CHECK-NEXT: ret i16 [[RES_2]] @@ -513,12 +513,12 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] @@ -541,8 +541,8 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI5]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]] ; CHECK-NEXT: [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]] ; CHECK-NEXT: ret i16 [[RES_2]] @@ -594,12 +594,12 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr % ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] @@ -621,8 +621,8 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr % ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI5]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]] ; CHECK-NEXT: [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]] ; CHECK-NEXT: ret i16 [[RES_2]] @@ -673,10 +673,10 @@ define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x double> [[TMP2]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x double> [[TMP2]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP2]], i32 2 ; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 1.000000e+01, %[[ENTRY]] ] @@ -697,7 +697,7 @@ define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; 
CHECK: [[EXIT]]: ; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi double [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RES:%.*]] = fadd double [[FOR_1_LCSSA]], [[FOR_2_LCSSA]] ; CHECK-NEXT: ret double [[RES]] ; @@ -785,8 +785,8 @@ define i64 @test_first_order_recurrences_and_induction(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] @@ -846,8 +846,8 @@ define i64 @test_first_order_recurrences_and_induction2(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] @@ -906,8 +906,8 @@ define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] @@ -973,8 +973,8 @@ define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) { ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1000 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2 ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, 
%[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -1046,8 +1046,8 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
@@ -1072,7 +1072,7 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
; CHECK: [[END]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi double [ [[FOR_1]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi double [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi double [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RES_1:%.*]] = fadd double [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
; CHECK-NEXT: [[RES_2:%.*]] = fadd double [[RES_1]], [[FOR_3_LCSSA]]
; CHECK-NEXT: ret double [[RES_2]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
index 3bcb832b3fe38..86171e69bb4ac 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
@@ -88,8 +88,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
index b20d59bd5760e..f4044a759dad4 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
@@ -33,10 +33,10 @@ define i64 @pr97452_scalable_vf1_for_live_out(ptr %src) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP10]]
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 1 x i64> [[WIDE_LOAD]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 1 x i64> [[TMP7]], i32 [[TMP13]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
@@ -52,7 +52,7 @@ define i64 @pr97452_scalable_vf1_for_live_out(ptr %src) {
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 17c2be64f1a31..ff5c29f494178 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -896,8 +896,8 @@ define i32 @PR27246() {
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
@@ -985,8 +985,8 @@ define i32 @PR27246() {
; SINK-AFTER-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
@@ -1312,8 +1312,8 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
; UNROLL-NO-IC-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -1378,8 +1378,8 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
;
SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; SINK-AFTER: middle.block: -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 ; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: ; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -3359,8 +3359,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -3450,8 +3450,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; SINK-AFTER: middle.block: -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: ; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll new file mode 100644 index 0000000000000..148beb64a3609 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll @@ -0,0 +1,435 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s + +define float @fmin_olt_with_select_1(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmin_olt_with_select_1( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[L]], [[MAX]] +; CHECK-NEXT: [[MIN_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; 
CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp olt float %l, %min + %min.next = select i1 %cmp, float %l, float %min + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fmin_olt_with_select_2(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmin_olt_with_select_2( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[MAX]], [[L]] +; CHECK-NEXT: [[MIN_NEXT]] = select i1 [[CMP]], float [[MAX]], float [[L]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp olt float %min, %l + %min.next = select i1 %cmp, float %min, float %l + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fmin_ogt_with_select_1(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmin_ogt_with_select_1( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[L]], [[MIN]] +; CHECK-NEXT: [[MIN_NEXT]] = select i1 [[CMP]], float [[MIN]], float [[L]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MIN_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp ogt 
float %l, %min + %min.next = select i1 %cmp, float %min, float %l + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fmin_ogt_with_select_2(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmin_ogt_with_select_2( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[MIN]], [[L]] +; CHECK-NEXT: [[MIN_NEXT]] = select i1 [[CMP]], float [[L]], float [[MIN]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MIN_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MIN_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp ogt float %min, %l + %min.next = select i1 %cmp, float %l, float %min + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fmin_olt_with_select_store_result(ptr %src, ptr %dst, i64 %n) { +; CHECK-LABEL: define float @fmin_olt_with_select_store_result( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[L]], [[MAX]] +; CHECK-NEXT: [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]] +; CHECK-NEXT: store float [[MAX_NEXT]], ptr [[DST]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp olt float %l, %min + %min.next = select i1 %cmp, float %l, float %min + store float %min.next, ptr %dst, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fminnum_1(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fminnum_1( +; CHECK-SAME: ptr [[SRC:%.*]], i64 
[[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[L]], float [[MAX]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %min.next = call float @llvm.minnum.f32(float %l, float %min) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fminnum_2(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fminnum_2( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %min.next = call float @llvm.minnum.f32(float %min, float %l) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fmin_select_with_blend(ptr %A, ptr %B) { +; CHECK-LABEL: define float @fmin_select_with_blend( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[MIN_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[L_A]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds 
float, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_2:%.*]] = fcmp olt float [[MAX]], [[L]] +; CHECK-NEXT: [[MAX_SEL:%.*]] = select i1 [[C_2]], float [[MAX]], float [[L]] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MIN_NEXT]] = phi float [ [[MAX_SEL]], %[[LOOP_THEN]] ], [ [[MAX]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MIN_NEXT]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %min = phi float [ 0.000000e+00, %entry ], [ %min.next, %loop.latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv + %l.A = load i32, ptr %gep.A + %c.1 = icmp eq i32 %l.A, 0 + br i1 %c.1, label %loop.then, label %loop.latch + +loop.then: + %gep.B = getelementptr inbounds float, ptr %B, i64 %iv + %l = load float, ptr %gep.B + %c.2 = fcmp olt float %min, %l + %min.sel = select i1 %c.2, float %min, float %l + br label %loop.latch + +loop.latch: + %min.next = phi float [ %min.sel, %loop.then ], [ %min, %loop ] + %iv.next = add i64 %iv, 1 + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret float %min.next +} + +define float @fmin_with_select_and_load_store(ptr %src, ptr noalias %dst, i64 %n) { +; CHECK-LABEL: define float @fmin_with_select_and_load_store( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[L]], [[MAX]] +; CHECK-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_DST_1]], align 4 +; CHECK-NEXT: [[GEP_DST_0:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 [[L_2]], ptr [[GEP_DST_0]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp olt float %l, %min + %iv.1 = add i64 %iv, 1 + %gep.dst.1 = getelementptr inbounds i32, ptr %dst, i64 %iv.1 + %l.2 = load i32, ptr %gep.dst.1 + %gep.dst.0 = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 %l.2, ptr %gep.dst.0 + %min.next = select i1 %cmp, float %l, float %min + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + 
br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fmin_ugt_with_select_1(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmin_ugt_with_select_1( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[L]], [[MAX]] +; CHECK-NEXT: [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp ugt float %l, %min + %min.next = select i1 %cmp, float %l, float %min + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} + +define float @fmin_oge_with_select_1(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmin_oge_with_select_1( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge float [[L]], [[MAX]] +; CHECK-NEXT: [[MAX_NEXT]] = select i1 [[CMP]], float [[L]], float [[MAX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi float [ -1.000000e+07, %entry ], [ %min.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %cmp = fcmp oge float %l, %min + %min.next = select i1 %cmp, float %l, float %min + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %min.next +} diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index ebddca2294d9c..38e436e24c42a 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -627,8 +627,8 @@ define i32 @pr45526_pgso() !prof !14 { ; NPGSO-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508 ; NPGSO-NEXT: br i1 [[TMP1]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
-; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; NPGSO-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; NPGSO: [[SCALAR_PH]]:
; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 508, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index 2747895f06a7b..ce4270dc4b7fa 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -18,11 +18,9 @@ define void @reuse_lcssa_phi_for_add_rec1(ptr %head) {
; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: br i1 [[EC_1]], label %[[PH:.*]], label %[[LOOP_1]]
; CHECK: [[PH]]:
-; CHECK-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP_1]] ]
; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP_1]] ]
-; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ]
; CHECK-NEXT: [[SRC_2:%.*]] = tail call noalias noundef dereferenceable_or_null(8) ptr @calloc(i64 1, i64 8)
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV_2_LCSSA]], 1
; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]]
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index f429677e3875e..23a3fc1606e88 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -61,16 +61,16 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF1: [[MIDDLE_BLOCK]]:
; CHECK-VF4UF1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1-NEXT: [[TMP26:%.*]] = mul nuw i32 [[TMP25]], 4
-; CHECK-VF4UF1-NEXT: [[TMP27:%.*]] = sub i32 [[TMP26]], 2
+; CHECK-VF4UF1-NEXT: [[TMP27:%.*]] = sub i32 [[TMP26]], 1
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP27]]
; CHECK-VF4UF1-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1-NEXT: [[TMP29:%.*]] = mul nuw i32 [[TMP28]], 4
-; CHECK-VF4UF1-NEXT: [[TMP30:%.*]] = sub i32 [[TMP29]], 1
+; CHECK-VF4UF1-NEXT: [[TMP30:%.*]] = sub i32 [[TMP29]], 2
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP30]]
; CHECK-VF4UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-VF4UF1-NEXT: br i1 [[CMP_N]], [[FOR_EXIT:label %.*]], label %[[SCALAR_PH]]
; CHECK-VF4UF1: [[SCALAR_PH]]:
-; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], %[[FOR_PREHEADER]] ], [ [[PRE_LOAD]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], %[[FOR_PREHEADER]] ], [ [[PRE_LOAD]], %[[VECTOR_MEMCHECK]] ]
; CHECK-VF4UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-VF4UF1-NEXT: br label %[[SCALAR_BODY:.*]]
; CHECK-VF4UF1: [[SCALAR_BODY]]:
@@ -138,16 +138,16 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF2: [[MIDDLE_BLOCK]]:
; CHECK-VF4UF2-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF2-NEXT: [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 4
-; CHECK-VF4UF2-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 2
+; CHECK-VF4UF2-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1
; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD3]], i32 [[TMP35]]
; CHECK-VF4UF2-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF2-NEXT: [[TMP37:%.*]] = mul nuw i32 [[TMP36]], 4
-; CHECK-VF4UF2-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], 1
+; CHECK-VF4UF2-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], 2
; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD3]], i32 [[TMP38]]
; CHECK-VF4UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-VF4UF2-NEXT: br i1 [[CMP_N]], [[FOR_EXIT:label %.*]], label %[[SCALAR_PH]]
; CHECK-VF4UF2: [[SCALAR_PH]]:
-; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], %[[FOR_PREHEADER]] ], [ [[PRE_LOAD]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], %[[FOR_PREHEADER]] ], [ [[PRE_LOAD]], %[[VECTOR_MEMCHECK]] ]
; CHECK-VF4UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-VF4UF2-NEXT: br label %[[SCALAR_BODY:.*]]
; CHECK-VF4UF2: [[SCALAR_BODY]]:
@@ -678,17 +678,17 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; CHECK-VF4UF1: [[MIDDLE_BLOCK]]:
; CHECK-VF4UF1-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 4
-; CHECK-VF4UF1-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 2
+; CHECK-VF4UF1-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP13]]
; CHECK-VF4UF1-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
-; CHECK-VF4UF1-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF1-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 2
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP16]]
; CHECK-VF4UF1-NEXT: [[CMP_N:%.*]] = icmp eq i32 96, [[N_VEC]]
; CHECK-VF4UF1-NEXT: br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
; CHECK-VF4UF1: [[SCALAR_PH]]:
; CHECK-VF4UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-VF4UF1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4UF1: [[FOR_BODY]]:
;
@@ -727,17 +727,17 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; CHECK-VF4UF2: [[MIDDLE_BLOCK]]:
; CHECK-VF4UF2-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 4
-; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 2
+; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP13]]
; CHECK-VF4UF2-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF2-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
-; CHECK-VF4UF2-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF2-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 2
; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP16]]
; CHECK-VF4UF2-NEXT: [[CMP_N:%.*]] = icmp eq i32 96, [[N_VEC]]
; CHECK-VF4UF2-NEXT: br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
; CHECK-VF4UF2: [[SCALAR_PH]]:
; CHECK-VF4UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-VF4UF2-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4UF2: [[FOR_BODY]]:
;
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 732214aa1449e..87b11dc77d417 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -961,8 +961,8 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[FOR_RESULT:%.+]]> = extract-penultimate-element ir<%for.1.next>
; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-last-element ir<%for.1.next>
+; CHECK-NEXT: EMIT vp<[[FOR_RESULT:%.+]]> = extract-penultimate-element ir<%for.1.next>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
index 09a59de44c745..d55559d632019 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
@@ -62,12 +62,11 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) {
; CHECK-NEXT: store <8 x i16> [[PREDPHI]], ptr [[DCT]], align 2, !alias.scope [[META0]], !noalias [[META3]]
; CHECK-NEXT: store <8 x i16> [[PREDPHI34]], ptr [[TMP0]], align 2, !alias.scope [[META0]], !noalias [[META3]]
; CHECK-NEXT: [[BIN_RDX35:%.*]] = or <8 x i16> [[PREDPHI34]], [[PREDPHI]]
-; CHECK-NEXT: [[BIN_RDX:%.*]] = sext <8 x i16> [[BIN_RDX35]] to <8 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP29:%.*]] = tail call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[BIN_RDX35]])
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[TMP29]], [[VECTOR_BODY]] ], [ [[OR_15:%.*]], [[IF_END_15:%.*]] ]
-; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[OR_LCSSA]], 0
+; CHECK-NEXT: [[OR_LCSSA_IN:%.*]] = phi i16 [ [[TMP29]], [[VECTOR_BODY]] ], [ [[OR_1551:%.*]], [[IF_END_15:%.*]] ]
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i16 [[OR_LCSSA_IN]], 0
; CHECK-NEXT: [[LNOT_EXT:%.*]] = zext i1 [[TOBOOL]] to i32
; CHECK-NEXT: ret i32 [[LNOT_EXT]]
; CHECK: for.body:
@@ -514,8 +513,7 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) {
; CHECK: if.end.15:
; CHECK-NEXT: [[STOREMERGE_15:%.*]] = phi i16 [ [[CONV28_15]], [[IF_ELSE_15]] ], [ [[CONV12_15]], [[IF_THEN_15]] ]
; CHECK-NEXT: store i16 [[STOREMERGE_15]], ptr [[ARRAYIDX_15]], align 2
-; CHECK-NEXT: [[OR_1551:%.*]] = or i16 [[OR_1450]], [[STOREMERGE_15]]
-; CHECK-NEXT: [[OR_15]] = sext i16 [[OR_1551]] to i32
+; CHECK-NEXT: [[OR_1551]] = or i16 [[OR_1450]], [[STOREMERGE_15]]
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
;
entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 9032c363eb936..9d613b8fe456d 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -41,7 +41,7 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <8 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP5]], splat (i32 15)
; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP6]], <8 x i32> splat (i32 32767))
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP8:%.*]] = trunc nsw <8 x i32> [[TMP7]] to <8 x i16>
; CHECK-NEXT: store <8 x i16> [[TMP8]], ptr [[NEXT_GEP14]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index 5e3fd156666f5..410696260a855 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -16,12 +16,11 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
index 2b5ee59aeb163..96dd691c4816e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
@@ -5,24 +5,21 @@ define i16 @foo(i16 %in1, i16 %in2) {
; CHECK-LABEL: define i16 @foo(
; CHECK-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
-;
CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i64> [[TMP9]], splat (i64 65535) -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP12]], splat (i64 65533) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: [[ZEXT1_1:%.*]] = zext i16 [[IN1]] to i64 +; CHECK-NEXT: [[ZEXT2_1:%.*]] = zext i16 [[IN2]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i64 [[ZEXT2_1]], [[ZEXT1_1]] +; CHECK-NEXT: [[AND1:%.*]] = and i64 [[TMP10]], 65535 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[AND1]], 65533 ; CHECK-NEXT: [[ZEXT3_1:%.*]] = zext i1 [[TMP8]] to i16 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 ; CHECK-NEXT: [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605 ; CHECK-NEXT: [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16 ; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: [[ZEXT1_2:%.*]] = zext i16 [[IN1]] to i64 +; CHECK-NEXT: [[ZEXT2_2:%.*]] = zext i16 [[IN2]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = mul nuw nsw i64 [[ZEXT2_2]], [[ZEXT1_2]] +; CHECK-NEXT: [[AND2:%.*]] = and i64 [[TMP13]], 65535 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[AND2]], 65533 ; CHECK-NEXT: [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: [[CMP2_2:%.*]] = icmp ne i64 [[TMP13]], 196605 ; CHECK-NEXT: [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16 ; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]] diff --git a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll similarity index 89% rename from llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll rename to llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll index a1f4590a56919..04c69106d97ff 100644 --- a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s define void @test() { ; CHECK-LABEL: define void @test() { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll index 667fc41c069e1..10a17f7e3f9a6 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -645,20 +645,21 @@ define i1 @tryMapToRange(ptr %values, ptr %result, <2 x i64> %hi, <2 x i64> %lo) ; CHECK-NEXT: [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64> ; CHECK-NEXT: [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8> ; CHECK-NEXT: [[A1:%.*]] = and <16 x i8> [[BC1]], +; CHECK-NEXT: [[TMP5:%.*]] 
= extractelement <16 x i8> [[A1]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[A1]], i64 8 ; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]] ; CHECK-NEXT: [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64> ; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8> ; CHECK-NEXT: [[A2:%.*]] = and <16 x i8> [[BC2]], +; CHECK-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[A2]], i64 0 +; CHECK-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[A2]], i64 8 ; CHECK-NEXT: [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]] ; CHECK-NEXT: [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1) ; CHECK-NEXT: store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1 ; CHECK-NEXT: [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[O3]], 0 +; CHECK-NEXT: [[O2:%.*]] = or i8 [[E4]], [[E3]] +; CHECK-NEXT: [[O4:%.*]] = or i8 [[O3]], [[O2]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[O4]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %l = load <2 x i64>, ptr %values, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll index 8d7d31ab98441..f397290299a4f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll @@ -7,54 +7,72 @@ define void @h(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h, i ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV9:%.*]] = zext i16 [[A]] to i32 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[E]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[I]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[M]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32 -; CHECK-NEXT: [[SUB:%.*]] = or i32 [[CONV9]], [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[G]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[K]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i16> [[TMP7]], i16 [[O]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i16> [[TMP8]], i16 [[C]], i32 3 +; CHECK-NEXT: [[CONV310:%.*]] = zext i16 [[B]] to i32 +; CHECK-NEXT: [[ADD4:%.*]] = or i32 [[CONV310]], [[CONV9]] +; CHECK-NEXT: [[SUB:%.*]] = or i32 [[CONV9]], [[CONV310]] +; CHECK-NEXT: [[CONV15:%.*]] = sext i16 [[C]] to i32 ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 0, 0 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr i8, ptr null, i64 24 ; CHECK-NEXT: [[CONV19:%.*]] = sext i16 [[D]] to i32 ; CHECK-NEXT: [[SUB20:%.*]] = or i32 [[SHR]], [[CONV19]] +; CHECK-NEXT: [[SHR29:%.*]] = ashr i32 0, 0 +; CHECK-NEXT: [[ADD30:%.*]] = or i32 [[SHR29]], [[CONV15]] ; CHECK-NEXT: [[SUB39:%.*]] = or i32 [[SUB]], [[SUB20]] ; CHECK-NEXT: [[CONV40:%.*]] = trunc i32 [[SUB39]] 
to i16 ; CHECK-NEXT: store i16 [[CONV40]], ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[SUB44:%.*]] = or i32 [[ADD4]], [[ADD30]] +; CHECK-NEXT: [[CONV45:%.*]] = trunc i32 [[SUB44]] to i16 +; CHECK-NEXT: store i16 [[CONV45]], ptr [[ARRAYIDX18]], align 2 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr i8, ptr null, i64 18 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = zext i16 [[TMP10]] to i32 -; CHECK-NEXT: [[ADD4_1:%.*]] = or i32 [[TMP11]], 0 +; CHECK-NEXT: [[CONV3_112:%.*]] = zext i16 [[E]] to i32 +; CHECK-NEXT: [[ADD4_1:%.*]] = or i32 [[CONV3_112]], 0 +; CHECK-NEXT: [[SUB_1:%.*]] = or i32 0, [[CONV3_112]] ; CHECK-NEXT: [[CONV15_1:%.*]] = sext i16 [[F]] to i32 +; CHECK-NEXT: [[SHR_1:%.*]] = ashr i32 0, 0 ; CHECK-NEXT: [[ARRAYIDX18_1:%.*]] = getelementptr i8, ptr null, i64 26 +; CHECK-NEXT: [[CONV19_1:%.*]] = sext i16 [[G]] to i32 +; CHECK-NEXT: [[SUB20_1:%.*]] = or i32 [[SHR_1]], [[CONV19_1]] ; CHECK-NEXT: [[SHR29_1:%.*]] = ashr i32 0, 0 ; CHECK-NEXT: [[ADD30_1:%.*]] = or i32 [[SHR29_1]], [[CONV15_1]] +; CHECK-NEXT: [[SUB39_1:%.*]] = or i32 [[SUB_1]], [[SUB20_1]] +; CHECK-NEXT: [[CONV40_1:%.*]] = trunc i32 [[SUB39_1]] to i16 +; CHECK-NEXT: store i16 [[CONV40_1]], ptr [[ARRAYIDX2_1]], align 2 ; CHECK-NEXT: [[SUB44_1:%.*]] = or i32 [[ADD4_1]], [[ADD30_1]] ; CHECK-NEXT: [[CONV45_1:%.*]] = trunc i32 [[SUB44_1]] to i16 ; CHECK-NEXT: store i16 [[CONV45_1]], ptr [[ARRAYIDX18_1]], align 2 ; CHECK-NEXT: [[CONV_213:%.*]] = zext i16 [[H]] to i32 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr i8, ptr null, i64 20 +; CHECK-NEXT: [[CONV3_214:%.*]] = zext i16 [[I]] to i32 ; CHECK-NEXT: [[ADD4_2:%.*]] = or i32 0, [[CONV_213]] +; CHECK-NEXT: [[SUB_2:%.*]] = or i32 0, [[CONV3_214]] ; CHECK-NEXT: [[CONV15_2:%.*]] = sext i16 [[J]] to i32 +; CHECK-NEXT: [[SHR_2:%.*]] = ashr i32 0, 0 ; CHECK-NEXT: [[ARRAYIDX18_2:%.*]] = getelementptr i8, ptr null, i64 28 +; CHECK-NEXT: [[CONV19_2:%.*]] = sext i16 [[K]] to i32 +; CHECK-NEXT: [[SUB20_2:%.*]] = or i32 [[SHR_2]], [[CONV19_2]] ; CHECK-NEXT: [[SHR29_2:%.*]] = ashr i32 0, 0 ; CHECK-NEXT: [[ADD30_2:%.*]] = or i32 [[SHR29_2]], [[CONV15_2]] +; CHECK-NEXT: [[SUB39_2:%.*]] = or i32 [[SUB_2]], [[SUB20_2]] +; CHECK-NEXT: [[CONV40_2:%.*]] = trunc i32 [[SUB39_2]] to i16 +; CHECK-NEXT: store i16 [[CONV40_2]], ptr [[ARRAYIDX2_2]], align 2 ; CHECK-NEXT: [[SUB44_2:%.*]] = or i32 [[ADD4_2]], [[ADD30_2]] ; CHECK-NEXT: [[CONV45_2:%.*]] = trunc i32 [[SUB44_2]] to i16 ; CHECK-NEXT: store i16 [[CONV45_2]], ptr [[ARRAYIDX18_2]], align 2 ; CHECK-NEXT: [[CONV_315:%.*]] = zext i16 [[L]] to i32 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr i8, ptr null, i64 22 +; CHECK-NEXT: [[CONV3_316:%.*]] = zext i16 [[M]] to i32 ; CHECK-NEXT: [[ADD4_3:%.*]] = or i32 0, [[CONV_315]] +; CHECK-NEXT: [[SUB_3:%.*]] = or i32 0, [[CONV3_316]] ; CHECK-NEXT: [[CONV15_3:%.*]] = sext i16 [[N]] to i32 +; CHECK-NEXT: [[SHR_3:%.*]] = ashr i32 0, 0 ; CHECK-NEXT: [[ARRAYIDX18_3:%.*]] = getelementptr i8, ptr null, i64 30 +; CHECK-NEXT: [[CONV19_3:%.*]] = sext i16 [[O]] to i32 +; CHECK-NEXT: [[SUB20_3:%.*]] = or i32 [[SHR_3]], [[CONV19_3]] ; CHECK-NEXT: [[SHR29_3:%.*]] = ashr i32 0, 0 ; CHECK-NEXT: [[ADD30_3:%.*]] = or i32 [[SHR29_3]], [[CONV15_3]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> , i16 [[A]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i16> [[TMP3]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> zeroinitializer, [[TMP9]] -; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i16> [[TMP13]], [[TMP14]] -; CHECK-NEXT: store <4 x i16> 
[[TMP15]], ptr [[ARRAYIDX2_1]], align 2 +; CHECK-NEXT: [[SUB39_3:%.*]] = or i32 [[SUB_3]], [[SUB20_3]] +; CHECK-NEXT: [[CONV40_3:%.*]] = trunc i32 [[SUB39_3]] to i16 +; CHECK-NEXT: store i16 [[CONV40_3]], ptr [[ARRAYIDX2_3]], align 2 ; CHECK-NEXT: [[SUB44_3:%.*]] = or i32 [[ADD4_3]], [[ADD30_3]] ; CHECK-NEXT: [[CONV45_3:%.*]] = trunc i32 [[SUB44_3]] to i16 ; CHECK-NEXT: store i16 [[CONV45_3]], ptr [[ARRAYIDX18_3]], align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index fcbe2d631ba8b..3376fb8910b77 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s ; These tests check that we remove from consideration pairs of seed @@ -26,7 +26,7 @@ ; YAML-NEXT: Function: getelementptr_4x32 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '4' +; YAML-NEXT: - Cost: '6' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' @@ -36,7 +36,7 @@ ; YAML-NEXT: Function: getelementptr_4x32 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '6' +; YAML-NEXT: - Cost: '8' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' @@ -51,35 +51,39 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[ADD16:%.*]] = extractelement <2 x i32> [[TMP17:%.*]], i32 0 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], 
[[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP12]] ; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP7]] +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP16]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]] ; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]] -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP13]] ; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP18]], i32 1 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP14]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[ADD16]] = add nsw i32 [[ADD11]], [[T12]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> , i32 [[ADD11]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[T12]], i32 0 +; CHECK-NEXT: [[TMP17]] = add nsw <2 x i32> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; @@ -129,7 +133,7 @@ for.body: ; YAML: Function: getelementptr_2x32 ; YAML: Args: ; YAML: - String: 'SLP vectorized with cost ' -; YAML: - Cost: '4' +; YAML: - Cost: '8' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' @@ -139,36 +143,42 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[Y:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[Z:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP13:%.*]], i32 0 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: 
[[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP12]], 1 +; CHECK-NEXT: [[T5:%.*]] = add nsw i32 [[T4]], 0 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[T5]] +; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP5]] +; CHECK-NEXT: [[T7:%.*]] = add nsw i32 [[T4]], 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T7]] +; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 -; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]] -; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 -; CHECK-NEXT: [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]] -; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T11]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]] +; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP10]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> , i32 [[ADD11]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[T12]], i32 0 +; CHECK-NEXT: [[TMP13]] = add nsw <2 x i32> [[TMP11]], 
[[TMP14]] +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll index 1826e19d8e73f..4847149c87a18 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll @@ -14,7 +14,7 @@ ; YAML-NEXT: Function: test_i16_extend ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-20' +; YAML-NEXT: - Cost: '-16' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '5' ; YAML-NEXT: ... diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll index 9f5744b17cb79..929fb29a4a679 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -600,15 +600,27 @@ bb15: ; preds = %bb15, %bb14 define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) { ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks( ; CHECK-NEXT: entry: -; CHECK-NEXT: store <2 x i32> , ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul float 1.000000e+01, 2.000000e+01 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = fmul float 3.000000e+01, 2.000000e+01 +; CHECK-NEXT: [[TMP4:%.*]] = fptosi float [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300 +; CHECK-NEXT: [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]] +; CHECK-NEXT: store i32 [[TMP7]], ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 1 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[B:%.*]], align 8 ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]] ; CHECK: bb14: -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 10 to i64 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 2, [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 3 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 2 ; CHECK-NEXT: store float 0.000000e+00, ptr [[TMP20]], align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index 9562e6d41f7cd..fe3db7d462e8e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -14,168 +14,389 @@ define i64
@straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-LABEL: @straight( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]] +; CHECK-NEXT: [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]] +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]] +; CHECK-NEXT: [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]] +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]] +; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]] +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2 +; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32 +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]] +; CHECK-NEXT: [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]] +; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]] +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] +; CHECK-NEXT: [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]] +; CHECK-NEXT: [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]] +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] +; CHECK-NEXT: [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]] +; CHECK-NEXT: [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]] +; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32 +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] +; CHECK-NEXT: [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]] +; CHECK-NEXT: [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]] +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[ADD_141:%.*]] = add 
nuw nsw i32 [[ADD_7]], [[CONV_140]] +; CHECK-NEXT: [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]] +; CHECK-NEXT: [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]] +; CHECK-NEXT: [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_1_1]], align 2 +; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] +; CHECK-NEXT: [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]] +; CHECK-NEXT: [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]] +; CHECK-NEXT: [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2 +; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] +; CHECK-NEXT: [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]] +; CHECK-NEXT: [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]] +; CHECK-NEXT: [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2 +; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] +; CHECK-NEXT: [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]] +; CHECK-NEXT: [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]] +; CHECK-NEXT: [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2 +; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] +; CHECK-NEXT: [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]] +; CHECK-NEXT: [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]] +; CHECK-NEXT: [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2 +; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] +; CHECK-NEXT: [[MUL_5_1:%.*]] = mul nuw nsw i32 [[CONV_5_1]], [[CONV_5_1]] +; CHECK-NEXT: [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]] +; CHECK-NEXT: [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2 +; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] +; CHECK-NEXT: [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]] +; CHECK-NEXT: [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]] +; CHECK-NEXT: [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2 +; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32 +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] +; CHECK-NEXT: [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]] +; CHECK-NEXT: [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], 
[[CONV_244]] +; CHECK-NEXT: [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]] +; CHECK-NEXT: [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]] +; CHECK-NEXT: [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2 +; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] +; CHECK-NEXT: [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]] +; CHECK-NEXT: [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]] +; CHECK-NEXT: [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2 +; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] +; CHECK-NEXT: [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]] +; CHECK-NEXT: [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]] +; CHECK-NEXT: [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3 +; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2 +; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32 +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] +; CHECK-NEXT: [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]] +; CHECK-NEXT: [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]] +; CHECK-NEXT: [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2 +; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] +; CHECK-NEXT: [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]] +; CHECK-NEXT: [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]] +; CHECK-NEXT: [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5 +; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2 +; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] +; CHECK-NEXT: [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]] +; CHECK-NEXT: [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]] +; CHECK-NEXT: [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2 +; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32 +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]] +; CHECK-NEXT: [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]] +; CHECK-NEXT: [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]] +; CHECK-NEXT: [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7 +; CHECK-NEXT: [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2 +; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32 +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] +; CHECK-NEXT: [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]] +; CHECK-NEXT: [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]] ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32 +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], 
[[CONV_348]] +; CHECK-NEXT: [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]] +; CHECK-NEXT: [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]] +; CHECK-NEXT: [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1 +; CHECK-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2 +; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32 +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] +; CHECK-NEXT: [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]] +; CHECK-NEXT: [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]] +; CHECK-NEXT: [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2 +; CHECK-NEXT: [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2 +; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32 +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] +; CHECK-NEXT: [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]] +; CHECK-NEXT: [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD11_1_3]] +; CHECK-NEXT: [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3 +; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2 +; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32 +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] +; CHECK-NEXT: [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]] +; CHECK-NEXT: [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]] +; CHECK-NEXT: [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2 +; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] +; CHECK-NEXT: [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]] +; CHECK-NEXT: [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]] +; CHECK-NEXT: [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5 +; CHECK-NEXT: [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2 +; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32 +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] +; CHECK-NEXT: [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]] +; CHECK-NEXT: [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]] +; CHECK-NEXT: [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6 +; CHECK-NEXT: [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2 +; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32 +; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] +; CHECK-NEXT: [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]] +; CHECK-NEXT: [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]] +; CHECK-NEXT: [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2 +; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP31]] to i32 +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] +; CHECK-NEXT: [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]] +; CHECK-NEXT: [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]] ; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2 +; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32 +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], 
[[CONV_452]] +; CHECK-NEXT: [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]] +; CHECK-NEXT: [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]] +; CHECK-NEXT: [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2 +; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32 +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] +; CHECK-NEXT: [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]] +; CHECK-NEXT: [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]] +; CHECK-NEXT: [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2 +; CHECK-NEXT: [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2 +; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32 +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] +; CHECK-NEXT: [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]] +; CHECK-NEXT: [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]] +; CHECK-NEXT: [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3 +; CHECK-NEXT: [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2 +; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32 +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] +; CHECK-NEXT: [[MUL_3_4:%.*]] = mul nuw nsw i32 [[CONV_3_4]], [[CONV_3_4]] +; CHECK-NEXT: [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]] +; CHECK-NEXT: [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4 +; CHECK-NEXT: [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2 +; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32 +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] +; CHECK-NEXT: [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]] +; CHECK-NEXT: [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]] +; CHECK-NEXT: [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5 +; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2 +; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32 +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] +; CHECK-NEXT: [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]] +; CHECK-NEXT: [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]] +; CHECK-NEXT: [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6 +; CHECK-NEXT: [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2 +; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32 +; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] +; CHECK-NEXT: [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]] +; CHECK-NEXT: [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]] +; CHECK-NEXT: [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7 +; CHECK-NEXT: [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2 +; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32 +; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] +; CHECK-NEXT: [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]] +; CHECK-NEXT: [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]] ; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2 +; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32 +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], 
[[CONV_556]] +; CHECK-NEXT: [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]] +; CHECK-NEXT: [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]] +; CHECK-NEXT: [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2 +; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32 +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] +; CHECK-NEXT: [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]] +; CHECK-NEXT: [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]] +; CHECK-NEXT: [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2 +; CHECK-NEXT: [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2 +; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32 +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]] +; CHECK-NEXT: [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]] +; CHECK-NEXT: [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]] +; CHECK-NEXT: [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3 +; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2 +; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32 +; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] +; CHECK-NEXT: [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]] +; CHECK-NEXT: [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]] +; CHECK-NEXT: [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4 +; CHECK-NEXT: [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2 +; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32 +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]] +; CHECK-NEXT: [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]] +; CHECK-NEXT: [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]] +; CHECK-NEXT: [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5 +; CHECK-NEXT: [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2 +; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32 +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] +; CHECK-NEXT: [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]] +; CHECK-NEXT: [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]] +; CHECK-NEXT: [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6 +; CHECK-NEXT: [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2 +; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32 +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] +; CHECK-NEXT: [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]] +; CHECK-NEXT: [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]] +; CHECK-NEXT: [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2 +; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32 +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] +; CHECK-NEXT: [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]] +; CHECK-NEXT: [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]] ; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2 +; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32 +; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], 
[[CONV_660]] +; CHECK-NEXT: [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]] +; CHECK-NEXT: [[ADD11_663:%.*]] = add i32 [[MUL_662]], [[ADD11_7_5]] +; CHECK-NEXT: [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2 +; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32 +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]] +; CHECK-NEXT: [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]] +; CHECK-NEXT: [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]] +; CHECK-NEXT: [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2 +; CHECK-NEXT: [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2 +; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32 +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] +; CHECK-NEXT: [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]] +; CHECK-NEXT: [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]] +; CHECK-NEXT: [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3 +; CHECK-NEXT: [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2 +; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32 +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] +; CHECK-NEXT: [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]] +; CHECK-NEXT: [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]] +; CHECK-NEXT: [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4 +; CHECK-NEXT: [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2 +; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32 +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] +; CHECK-NEXT: [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]] +; CHECK-NEXT: [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]] +; CHECK-NEXT: [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5 +; CHECK-NEXT: [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2 +; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP53]] to i32 +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] +; CHECK-NEXT: [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]] +; CHECK-NEXT: [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]] +; CHECK-NEXT: [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6 +; CHECK-NEXT: [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2 +; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32 +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] +; CHECK-NEXT: [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]] +; CHECK-NEXT: [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]] +; CHECK-NEXT: [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7 +; CHECK-NEXT: [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2 +; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32 +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] +; CHECK-NEXT: [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]] +; CHECK-NEXT: [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]] ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr 
[[ADD_PTR_1]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <64 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> -; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP83]], <64 x i32> -; CHECK-NEXT: [[TMP85:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <64 x i16> [[TMP84]], <64 x i16> [[TMP85]], <64 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP88:%.*]] = shufflevector <64 x i16> [[TMP86]], <64 x i16> [[TMP87]], <64 x i32> -; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP88]], <64 x i16> [[TMP89]], <64 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5 -; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6 -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]] -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7 -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8 -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9 -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10 -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], 
i32 11 -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]] -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12 -; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13 -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14 -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15 -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16 -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]] -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17 -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]] -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18 -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19 -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]] -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20 -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21 -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]] -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22 -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23 -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]] -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24 -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25 -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26 -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27 -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]] -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28 -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29 -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]] -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30 -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31 -; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32 -; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]] -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33 -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]] -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34 -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35 -; CHECK-NEXT: 
[[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]] -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36 -; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]] -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37 -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38 -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]] -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39 -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]] -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40 -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]] -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41 -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]] -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42 -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43 -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]] -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44 -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]] -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45 -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]] -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46 -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]] -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47 -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]] -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48 -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]] -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49 -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50 -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51 -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]] -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52 -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53 -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]] -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54 -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]] -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55 -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56 -; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]] -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57 -; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]] -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58 -; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]] -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59 -; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw 
nsw i32 [[ADD_2_7]], [[TMP77]] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60 -; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61 -; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62 -; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]] -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63 -; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]]) +; CHECK-NEXT: [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2 +; CHECK-NEXT: [[CONV_764:%.*]] = zext i16 [[TMP56]] to i32 +; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]] +; CHECK-NEXT: [[MUL_766:%.*]] = mul nuw nsw i32 [[CONV_764]], [[CONV_764]] +; CHECK-NEXT: [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]] +; CHECK-NEXT: [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2 +; CHECK-NEXT: [[CONV_1_7:%.*]] = zext i16 [[TMP57]] to i32 +; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]] +; CHECK-NEXT: [[MUL_1_7:%.*]] = mul nuw nsw i32 [[CONV_1_7]], [[CONV_1_7]] +; CHECK-NEXT: [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]] +; CHECK-NEXT: [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2 +; CHECK-NEXT: [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2 +; CHECK-NEXT: [[CONV_2_7:%.*]] = zext i16 [[TMP58]] to i32 +; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]] +; CHECK-NEXT: [[MUL_2_7:%.*]] = mul nuw nsw i32 [[CONV_2_7]], [[CONV_2_7]] +; CHECK-NEXT: [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]] +; CHECK-NEXT: [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3 +; CHECK-NEXT: [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2 +; CHECK-NEXT: [[CONV_3_7:%.*]] = zext i16 [[TMP59]] to i32 +; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]] +; CHECK-NEXT: [[MUL_3_7:%.*]] = mul nuw nsw i32 [[CONV_3_7]], [[CONV_3_7]] +; CHECK-NEXT: [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]] +; CHECK-NEXT: [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4 +; CHECK-NEXT: [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2 +; CHECK-NEXT: [[CONV_4_7:%.*]] = zext i16 [[TMP60]] to i32 +; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]] +; CHECK-NEXT: [[MUL_4_7:%.*]] = mul nuw nsw i32 [[CONV_4_7]], [[CONV_4_7]] +; CHECK-NEXT: [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]] +; CHECK-NEXT: [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5 +; CHECK-NEXT: [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], align 2 +; CHECK-NEXT: [[CONV_5_7:%.*]] = zext i16 [[TMP61]] to i32 +; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]] +; CHECK-NEXT: [[MUL_5_7:%.*]] = mul nuw nsw i32 [[CONV_5_7]], [[CONV_5_7]] +; CHECK-NEXT: [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]] +; CHECK-NEXT: [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6 +; CHECK-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX_6_7]], align 2 +; CHECK-NEXT: [[CONV_6_7:%.*]] = zext i16 [[TMP62]] to i32 +; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 
[[ADD_5_7]], [[CONV_6_7]] +; CHECK-NEXT: [[MUL_6_7:%.*]] = mul nuw nsw i32 [[CONV_6_7]], [[CONV_6_7]] +; CHECK-NEXT: [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]] +; CHECK-NEXT: [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7 +; CHECK-NEXT: [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2 +; CHECK-NEXT: [[CONV_7_7:%.*]] = zext i16 [[TMP63]] to i32 +; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]] +; CHECK-NEXT: [[MUL_7_7:%.*]] = mul nuw nsw i32 [[CONV_7_7]], [[CONV_7_7]] +; CHECK-NEXT: [[ADD11_7_7:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]] ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 -; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP82]] to i64 +; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[ADD11_7_7]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 ; CHECK-NEXT: [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]] ; CHECK-NEXT: ret i64 [[ADD17]] @@ -580,13 +801,101 @@ define i64 @looped(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-NEXT: [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ] ; CHECK-NEXT: [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ] ; CHECK-NEXT: [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i16>, ptr [[P_ADDR_035]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]] -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]] +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]] +; CHECK-NEXT: [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]] +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]] +; CHECK-NEXT: [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]] +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]] +; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]] +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2 +; 
CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32 +; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]] +; CHECK-NEXT: [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]] +; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]] +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]] +; CHECK-NEXT: [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]] +; CHECK-NEXT: [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]] +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]] +; CHECK-NEXT: [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]] +; CHECK-NEXT: [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]] +; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32 +; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]] +; CHECK-NEXT: [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]] +; CHECK-NEXT: [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]] +; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2 +; CHECK-NEXT: [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]] +; CHECK-NEXT: [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]] +; CHECK-NEXT: [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]] +; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2 +; CHECK-NEXT: [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]] +; CHECK-NEXT: [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]] +; CHECK-NEXT: [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]] +; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2 +; CHECK-NEXT: [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]] +; CHECK-NEXT: [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]] +; CHECK-NEXT: [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]] +; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2 +; CHECK-NEXT: [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]] +; CHECK-NEXT: [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]] +; CHECK-NEXT: [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]] +; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2 +; CHECK-NEXT: [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[ADD_12:%.*]] = add i32 [[ADD_11]], [[CONV_12]] +; CHECK-NEXT: [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]] +; CHECK-NEXT: [[ADD11_12:%.*]] = add i32 
[[MUL_12]], [[ADD11_11]] +; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2 +; CHECK-NEXT: [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[ADD_13:%.*]] = add i32 [[ADD_12]], [[CONV_13]] +; CHECK-NEXT: [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]] +; CHECK-NEXT: [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]] +; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2 +; CHECK-NEXT: [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]] +; CHECK-NEXT: [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]] +; CHECK-NEXT: [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]] +; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2 +; CHECK-NEXT: [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32 +; CHECK-NEXT: [[OP_RDX1]] = add i32 [[ADD_14]], [[CONV_15]] +; CHECK-NEXT: [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]] +; CHECK-NEXT: [[OP_RDX]] = add i32 [[MUL_15]], [[ADD11_14]] ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[INC13]] = add nuw nsw i32 [[Y_038]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16 diff --git a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll similarity index 94% rename from llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll rename to llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll index 4478eab7b827a..15f4cffe77910 100644 --- a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) { ; CHECK-LABEL: define i32 @test( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll index afaf6b98e5081..094d60b66b393 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll @@ -90,14 +90,14 @@ entry: define void @splat_loads_i64(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) { ; CHECK-LABEL: @splat_loads_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds i64, ptr [[ARRAY2:%.*]], i64 1 -; CHECK-NEXT: [[LD_2_0:%.*]] = load i64, ptr [[ARRAY2]], align 8 -; CHECK-NEXT: [[LD_2_1:%.*]] = load i64, ptr [[GEP_2_1]], align 8 +; CHECK-NEXT: [[GEP_2_2:%.*]] = getelementptr inbounds i64, ptr [[ARRAY3:%.*]], i64 1 +; CHECK-NEXT: [[LD_2_2:%.*]] = load i64, ptr [[ARRAY3]], align 8 +; CHECK-NEXT: [[LD_2_3:%.*]] = load i64, ptr [[GEP_2_2]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[ARRAY1:%.*]], align 8 -; 
CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_2]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]] @@ -131,14 +131,14 @@ entry: define void @splat_loads_i32(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) { ; CHECK-LABEL: @splat_loads_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY2:%.*]], i64 1 -; CHECK-NEXT: [[LD_2_0:%.*]] = load i32, ptr [[ARRAY2]], align 8 -; CHECK-NEXT: [[LD_2_1:%.*]] = load i32, ptr [[GEP_2_1]], align 8 +; CHECK-NEXT: [[GEP_2_2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY3:%.*]], i64 1 +; CHECK-NEXT: [[LD_2_2:%.*]] = load i32, ptr [[ARRAY3]], align 8 +; CHECK-NEXT: [[LD_2_3:%.*]] = load i32, ptr [[GEP_2_2]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAY1:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_2]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i32> [[TMP0]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll index aeb82d800a2f7..3c2f9e4d0ab5d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll @@ -4,17 +4,17 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr null, align 1 +; CHECK-NEXT: [[G_2197_REAL32_PRE:%.*]] = load i32, ptr null, align 1 +; CHECK-NEXT: [[G_2197_IMAG33_PRE:%.*]] = load i32, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr null, i32 0, i32 1), align 1 ; CHECK-NEXT: br label %[[IF_END:.*]] ; CHECK: [[IF_THEN:.*]]: ; CHECK-NEXT: br label %[[IF_END]] ; CHECK: [[IF_END]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP0]], %[[ENTRY]] ], [ poison, %[[IF_THEN]] ] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[G_2197_IMAG33_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[G_2197_REAL32_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ] ; CHECK-NEXT: store i32 [[TMP2]], ptr null, align 1 ; CHECK-NEXT: br label %[[TRAP:.*]] ; CHECK: [[BB3:.*:]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: 
store i32 [[TMP4]], ptr null, align 1 ; CHECK-NEXT: ret void ; CHECK: [[TRAP]]: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll index 3cb81b72d26a1..14ce08cb7aebe 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll @@ -6,19 +6,36 @@ define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) { ; CHECK-LABEL: define void @should_vectorize_gep ; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) { ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE1]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[TMP0]] to <4 x i64> -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[BASE2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; CHECK-NEXT: [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; CHECK-NEXT: [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP8]] +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[BASE1]], align 2 +; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[LOAD1]] to i64 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[BASE2]], align 2 +; CHECK-NEXT: [[ZEXT2:%.*]] = zext i32 [[LOAD2]] to i64 +; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[ZEXT1]], [[ZEXT2]] +; CHECK-NEXT: [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB]] +; CHECK-NEXT: [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[BASE1]], i64 1 +; CHECK-NEXT: [[GETELEMENTPTR2:%.*]] = getelementptr i32, ptr [[BASE2]], i64 1 +; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEMENTPTR1]], align 2 +; CHECK-NEXT: [[ZEXT3:%.*]] = zext i32 [[LOAD3]] to i64 +; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr [[GETELEMENTPTR2]], align 2 +; CHECK-NEXT: [[ZEXT4:%.*]] = zext i32 [[LOAD4]] to i64 +; CHECK-NEXT: [[SUB2:%.*]] = sub i64 [[ZEXT3]], [[ZEXT4]] +; CHECK-NEXT: [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB2]] +; CHECK-NEXT: [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[BASE1]], i64 2 +; CHECK-NEXT: [[GETELEMENTPTR4:%.*]] = getelementptr i32, ptr [[BASE2]], i64 2 +; CHECK-NEXT: [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR3]], align 2 +; CHECK-NEXT: [[ZEXT5:%.*]] = zext i32 [[LOAD5]] to i64 +; CHECK-NEXT: [[LOAD6:%.*]] = load i32, ptr [[GETELEMENTPTR4]], align 2 +; CHECK-NEXT: [[ZEXT6:%.*]] = zext i32 [[LOAD6]] to i64 +; CHECK-NEXT: [[SUB3:%.*]] = sub i64 [[ZEXT5]], [[ZEXT6]] +; CHECK-NEXT: [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB3]] +; CHECK-NEXT: [[GETELEMENTPTR5:%.*]] = getelementptr i32, ptr [[BASE1]], i64 3 +; CHECK-NEXT: [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr [[BASE2]], i64 3 +; CHECK-NEXT: [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR5]], align 2 +; CHECK-NEXT: [[ZEXT7:%.*]] = zext i32 [[LOAD7]] to i64 +; CHECK-NEXT: [[LOAD8:%.*]] = load i32, ptr [[GETELEMENTPTR6]], 
align 2 +; CHECK-NEXT: [[ZEXT8:%.*]] = zext i32 [[LOAD8]] to i64 +; CHECK-NEXT: [[SUB4:%.*]] = sub i64 [[ZEXT7]], [[ZEXT8]] +; CHECK-NEXT: [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB4]] ; CHECK-NEXT: call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index c728572313d77..cbf8bc9dcf8f8 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -249,7 +249,7 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[PTR]], align 1 ; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll index ce26bd3b89392..e800b5e016b74 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll @@ -31,3 +31,34 @@ define void @test() { store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8 ret void } + +; Same as above, but %a7 is also used as a scalar and must be extracted from +; the wide load. (Or in this case, kept as a scalar load). 
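+; In the checks below the two shuffles split the wide load into even lanes
+; (0/2/4/6) and odd lanes (1/3/5/7) for the pairwise fsub, while %a7 is
+; reloaded as a plain scalar rather than extracted from the wide load.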
+define double @test_with_extract() { +; CHECK-LABEL: @test_with_extract( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8 +; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]] +; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8 +; CHECK-NEXT: ret double [[A7]] +; + %a0 = load double, ptr @src, align 8 + %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 + %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 + %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 + %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 + %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 + %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 + %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 + %res1 = fsub fast double %a0, %a1 + %res2 = fsub fast double %a2, %a3 + %res3 = fsub fast double %a4, %a5 + %res4 = fsub fast double %a6, %a7 + store double %res1, ptr @dst, align 8 + store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8 + store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8 + store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8 + ret double %a7 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll new file mode 100644 index 0000000000000..f90456297d7cb --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[PHI7:%.*]] = phi i32 [ 0, [[BB10:%.*]] ], [ 0, [[BB:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10]] ], [ zeroinitializer, [[BB]] ] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> , i32 [[PHI7]], i32 0 +; CHECK-NEXT: switch i32 0, label [[BB16:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB14:%.*]] +; CHECK-NEXT: i32 1, label [[BB11:%.*]] +; CHECK-NEXT: ] +; CHECK: bb9: +; CHECK-NEXT: br label [[BB11]] +; CHECK: bb10: +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb11: +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ poison, [[BB9:%.*]] ], [ [[TMP1]], [[BB1]] ] +; CHECK-NEXT: ret void +; CHECK: bb14: +; CHECK-NEXT: ret void +; CHECK: bb15: +; CHECK-NEXT: ret void +; CHECK: bb16: +; CHECK-NEXT: [[TMP3:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ], [ poison, [[BB25:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: bb25: +; CHECK-NEXT: switch i32 0, label [[BB16]] [ +; CHECK-NEXT: i32 0, label [[BB14]] +; CHECK-NEXT: i32 1, label [[BB15:%.*]] +; CHECK-NEXT: ] +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb10 ], [ 0, 
%bb ] + %phi2 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi3 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi4 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi5 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi6 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi7 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi8 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + switch i32 0, label %bb16 [ + i32 0, label %bb14 + i32 1, label %bb11 + ] + +bb9: + br label %bb11 + +bb10: + br label %bb1 + +bb11: + %phi12 = phi i32 [ 0, %bb9 ], [ %phi7, %bb1 ] + %phi13 = phi i32 [ 0, %bb9 ], [ undef, %bb1 ] + ret void + +bb14: + ret void + +bb15: + ret void + +bb16: + %phi17 = phi i32 [ %phi, %bb1 ], [ 0, %bb25 ] + %phi18 = phi i32 [ %phi2, %bb1 ], [ 0, %bb25 ] + %phi19 = phi i32 [ %phi3, %bb1 ], [ 0, %bb25 ] + %phi20 = phi i32 [ %phi4, %bb1 ], [ 0, %bb25 ] + %phi21 = phi i32 [ %phi5, %bb1 ], [ 0, %bb25 ] + %phi22 = phi i32 [ %phi6, %bb1 ], [ 0, %bb25 ] + %phi23 = phi i32 [ %phi7, %bb1 ], [ 0, %bb25 ] + %phi24 = phi i32 [ %phi8, %bb1 ], [ 0, %bb25 ] + ret void + +bb25: + switch i32 0, label %bb16 [ + i32 0, label %bb14 + i32 1, label %bb15 + ] +} diff --git a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll similarity index 91% rename from llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll rename to llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll index 002b9a70255da..278e55c67f23f 100644 --- a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s define i32 @test(ptr %sptr, i64 %0) { ; CHECK-LABEL: define i32 @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll index f56af934f19f5..b1864b43512d8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll @@ -14,50 +14,44 @@ define i32 @test(i32 %s.0) { ; CHECK: [[IF_END3:.*]]: ; CHECK-NEXT: br label %[[IF_END6:.*]] ; CHECK: [[IF_END6]]: -; CHECK-NEXT: [[J_4:%.*]] = phi i32 [ 0, %[[IF_END3]] ], [ [[TMP28:%.*]], %[[O]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ zeroinitializer, %[[O]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ zeroinitializer, %[[O]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ [[TMP22:%.*]], %[[O]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ [[TMP24:%.*]], %[[O]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ poison, %[[IF_END3]] ], [ zeroinitializer, %[[O]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ [[TMP29:%.*]], %[[O]] ] ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <8 x i32> -; 
CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> , i32 [[TMP22:%.*]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <8 x i32> [[TMP27]], <8 x i32> [[TMP30]], <8 x i32> ; CHECK-NEXT: br i1 false, label %[[IF_END24:.*]], label %[[IF_THEN11:.*]] ; CHECK: [[IF_THEN11]]: -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> poison, i32 [[J_4]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> [[TMP16]], <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP11]], <8 x i32> ; CHECK-NEXT: br label %[[IF_END24]] ; CHECK: [[IF_THEN18:.*]]: ; CHECK-NEXT: br label %[[T]] ; CHECK: [[T]]: -; CHECK-NEXT: [[TMP34:%.*]] = phi <8 x i32> [ [[TMP33:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <8 x i32> [ [[TMP33:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ] ; CHECK-NEXT: [[TMP17]] = extractelement <4 x i32> [[TMP23:%.*]], i32 0 ; CHECK-NEXT: br i1 false, label %[[IF_END24]], label %[[K]] ; CHECK: [[IF_END24]]: -; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP29]], %[[IF_THEN11]] ], [ [[TMP31]], %[[IF_END6]] ], [ [[TMP34]], %[[T]] ] -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP12]], %[[IF_THEN11]] ], [ [[TMP31]], %[[IF_END6]] ], [ [[TMP13]], %[[T]] ] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[O]] ; CHECK: [[O]]: -; CHECK-NEXT: [[TMP22]] = phi <2 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP19]], %[[IF_END24]] ] ; CHECK-NEXT: [[TMP23]] = phi <4 x i32> [ [[TMP1]], %[[K]] ], [ [[TMP20]], %[[IF_END24]] ] -; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP21]], %[[IF_END24]] ] +; CHECK-NEXT: [[TMP24]] = phi <2 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP19]], %[[IF_END24]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP34]], 
%[[IF_END24]] ] +; CHECK-NEXT: [[TMP22]] = extractelement <2 x i32> [[TMP24]], i32 1 ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i32> [[TMP25]], <8 x i32> , <8 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP33]] = shufflevector <8 x i32> [[TMP26]], <8 x i32> [[TMP32]], <8 x i32> -; CHECK-NEXT: [[TMP28]] = extractelement <4 x i32> [[TMP24]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <2 x i32> [[TMP24]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP21]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP29]] = shufflevector <2 x i32> [[TMP35]], <2 x i32> [[TMP28]], <2 x i32> ; CHECK-NEXT: br i1 false, label %[[T]], label %[[IF_END6]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll new file mode 100644 index 0000000000000..0dac02b0bcc09 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ptr [[B:%.*]], ptr [[C:%.*]], i32 [[TMP0:%.*]], ptr [[A:%.*]], i1 [[TOBOOL3_NOT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[TOBOOL3_NOT]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], splat (i32 16) +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16> +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i1> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL3_NOT]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16) +; CHECK-NEXT: [[TMP14:%.*]] = ashr <4 x i32> [[TMP13]], splat (i32 16) +; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i16> +; CHECK-NEXT: br i1 true, label [[BB3]], label [[BB2]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i16> [ [[TMP5]], [[BB1]] ], [ [[TMP15]], [[BB2]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: store i32 [[TMP18]], ptr [[B]], align 16 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = sext i16 [[TMP19]] to i32 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[A]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x 
i16> [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP22:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[C]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP16]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = sext i16 [[TMP23]] to i32 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[B]], align 8 +; CHECK-NEXT: ret i32 0 +; +entry: + br i1 %tobool3.not, label %bb1, label %bb2 + +bb1: + %conv1.i.us = ashr i32 %0, 16 + %cmp2.i.us = icmp slt i32 %conv1.i.us, %0 + %sext26.us = zext i1 %cmp2.i.us to i32 + %conv1.i.us.5 = ashr i32 %0, 16 + %cmp2.i.us.5 = icmp slt i32 %conv1.i.us.5, %0 + %sext26.us.5 = zext i1 %cmp2.i.us.5 to i32 + %conv1.i.us.6 = ashr i32 %0, 16 + %cmp2.i.us.6 = icmp slt i32 %conv1.i.us.6, %0 + %sext26.us.6 = zext i1 %cmp2.i.us.6 to i32 + %conv1.i.us.7 = ashr i32 %0, 16 + %cmp2.i.us.7 = icmp slt i32 %conv1.i.us.7, %0 + %sext26.us.7 = zext i1 %cmp2.i.us.7 to i32 + br label %bb3 + +bb2: + %cmp2.i = icmp sgt i32 %0, 0 + %1 = zext i1 %cmp2.i to i32 + %cond.i = select i1 %tobool3.not, i32 %0, i32 %1 + %sext26 = shl i32 %cond.i, 16 + %conv13 = ashr i32 %sext26, 16 + %cmp2.i.5 = icmp sgt i32 %0, 0 + %2 = zext i1 %cmp2.i.5 to i32 + %cond.i.5 = select i1 %tobool3.not, i32 %0, i32 %2 + %sext26.5 = shl i32 %cond.i.5, 16 + %conv13.5 = ashr i32 %sext26.5, 16 + %cmp2.i.6 = icmp sgt i32 %0, 0 + %3 = zext i1 %cmp2.i.6 to i32 + %cond.i.6 = select i1 %tobool3.not, i32 %0, i32 %3 + %sext26.6 = shl i32 %cond.i.6, 16 + %conv13.6 = ashr i32 %sext26.6, 16 + %cmp2.i.7 = icmp sgt i32 %0, 0 + %4 = zext i1 %cmp2.i.7 to i32 + %cond.i.7 = select i1 %tobool3.not, i32 %0, i32 %4 + %sext26.7 = shl i32 %cond.i.7, 16 + %conv13.7 = ashr i32 %sext26.7, 16 + br i1 true, label %bb3, label %bb2 + +bb3: + %conv13p = phi i32 [ %sext26.us, %bb1 ], [ %conv13, %bb2 ] + %conv13.5p = phi i32 [ %sext26.us.5, %bb1 ], [ %conv13.5, %bb2 ] + %conv13.6p = phi i32 [ %sext26.us.6, %bb1 ], [ %conv13.6, %bb2 ] + %conv13.7p = phi i32 [ %sext26.us.7, %bb1 ], [ %conv13.7, %bb2 ] + store i32 %conv13p, ptr %b, align 16 + store i32 %conv13.5p, ptr %a, align 8 + store i32 %conv13.6p, ptr %c, align 16 + store i32 %conv13.7p, ptr %b, align 8 + ret i32 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll index 9b6511d0d8284..d880c6b1783c8 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll @@ -17,12 +17,12 @@ define <2 x i32> @test(i32 %arg) { ; AARCH64-LABEL: define <2 x i32> @test( ; AARCH64-SAME: i32 [[ARG:%.*]]) { ; AARCH64-NEXT: bb: -; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 -; AARCH64-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[TMP0]], zeroinitializer -; AARCH64-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; AARCH64-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; AARCH64-NEXT: [[TMP2:%.*]] = or i32 [[ARG]], 0 +; AARCH64-NEXT: [[TMP3:%.*]] = mul i32 0, 1 ; AARCH64-NEXT: [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]] ; AARCH64-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] +; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 +; AARCH64-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[TMP3]], i32 1 ; AARCH64-NEXT: ret <2 x i32> [[TMP1]] ; bb: diff --git a/llvm/test/Transforms/SafeStack/NVPTX/lit.local.cfg b/llvm/test/Transforms/SafeStack/NVPTX/lit.local.cfg new file mode 100644 index 0000000000000..0d37b86e1c8e6 --- /dev/null +++ 
b/llvm/test/Transforms/SafeStack/NVPTX/lit.local.cfg @@ -0,0 +1,2 @@ +if not "NVPTX" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/SafeStack/NVPTX/safestack-libcall-error.ll b/llvm/test/Transforms/SafeStack/NVPTX/safestack-libcall-error.ll new file mode 100644 index 0000000000000..a17a9dd867e3a --- /dev/null +++ b/llvm/test/Transforms/SafeStack/NVPTX/safestack-libcall-error.ll @@ -0,0 +1,12 @@ +; RUN: not opt -disable-output -mtriple=nvptx64-- -mcpu=sm_90 -passes=safe-stack %s 2>&1 | FileCheck %s + +; CHECK: error: no libcall available for stackprotector check fail +define void @foo(i32 %t) #0 { + %vla = alloca i32, i32 %t, align 4 + call void @baz(ptr %vla) + ret void +} + +declare void @baz(ptr) + +attributes #0 = { nounwind safestack sspstrong } diff --git a/llvm/test/Transforms/SafeStack/NVPTX/safestack-pointer-address-libcall-error.ll b/llvm/test/Transforms/SafeStack/NVPTX/safestack-pointer-address-libcall-error.ll new file mode 100644 index 0000000000000..f41dcff76b248 --- /dev/null +++ b/llvm/test/Transforms/SafeStack/NVPTX/safestack-pointer-address-libcall-error.ll @@ -0,0 +1,12 @@ +; RUN: not opt -disable-output -mtriple=nvptx64-- -safestack-use-pointer-address -mcpu=sm_90 -passes=safe-stack %s 2>&1 | FileCheck -check-prefix=ERR %s + +; ERR: error: no libcall available for safestack pointer address +define void @foo(i32 %t) #0 { + %vla = alloca i32, i32 %t, align 4 + call void @baz(ptr %vla) + ret void +} + +declare void @baz(ptr) + +attributes #0 = { nounwind safestack sspstrong } diff --git a/llvm/test/Transforms/SimplifyCFG/ARM/phi-eliminate.ll b/llvm/test/Transforms/SimplifyCFG/ARM/phi-eliminate.ll index 8a36226851592..f23e27f6d6686 100644 --- a/llvm/test/Transforms/SimplifyCFG/ARM/phi-eliminate.ll +++ b/llvm/test/Transforms/SimplifyCFG/ARM/phi-eliminate.ll @@ -241,13 +241,17 @@ define i64 @test_i64(i1 %a, i1 %b, i64 %i, i64 %j, i64 %k) { ; ; CHECK-V8A-TWO-FOLD-6-LABEL: @test_i64( ; CHECK-V8A-TWO-FOLD-6-NEXT: entry: +; CHECK-V8A-TWO-FOLD-6-NEXT: br i1 [[A:%.*]], label [[M:%.*]], label [[O:%.*]] +; CHECK-V8A-TWO-FOLD-6: O: ; CHECK-V8A-TWO-FOLD-6-NEXT: [[IAJ:%.*]] = add i64 [[I:%.*]], [[J:%.*]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[IAJAK:%.*]] = add i64 [[IAJ]], [[K:%.*]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[IXJ:%.*]] = xor i64 [[I]], [[J]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[IXJXK:%.*]] = xor i64 [[IXJ]], [[K]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[WP:%.*]] = select i1 [[B:%.*]], i64 [[IAJAK]], i64 [[IXJXK]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[WP2:%.*]] = add i64 [[WP]], [[WP]] -; CHECK-V8A-TWO-FOLD-6-NEXT: [[W:%.*]] = select i1 [[A:%.*]], i64 2, i64 [[WP2]] +; CHECK-V8A-TWO-FOLD-6-NEXT: br label [[M]] +; CHECK-V8A-TWO-FOLD-6: M: +; CHECK-V8A-TWO-FOLD-6-NEXT: [[W:%.*]] = phi i64 [ [[WP2]], [[O]] ], [ 2, [[ENTRY:%.*]] ] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[R:%.*]] = add i64 [[W]], 1 ; CHECK-V8A-TWO-FOLD-6-NEXT: ret i64 [[R]] ; @@ -374,13 +378,17 @@ define i64 @test_i64_minsize(i1 %a, i1 %b, i64 %i, i64 %j, i64 %k) #0 { ; ; CHECK-V8A-TWO-FOLD-6-LABEL: @test_i64_minsize( ; CHECK-V8A-TWO-FOLD-6-NEXT: entry: +; CHECK-V8A-TWO-FOLD-6-NEXT: br i1 [[A:%.*]], label [[M:%.*]], label [[O:%.*]] +; CHECK-V8A-TWO-FOLD-6: O: ; CHECK-V8A-TWO-FOLD-6-NEXT: [[IAJ:%.*]] = add i64 [[I:%.*]], [[J:%.*]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[IAJAK:%.*]] = add i64 [[IAJ]], [[K:%.*]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[IXJ:%.*]] = xor i64 [[I]], [[J]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[IXJXK:%.*]] = xor i64 [[IXJ]], [[K]] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[WP:%.*]] = select i1 [[B:%.*]], i64 [[IAJAK]], i64 [[IXJXK]] ; 
CHECK-V8A-TWO-FOLD-6-NEXT: [[WP2:%.*]] = add i64 [[WP]], [[WP]] -; CHECK-V8A-TWO-FOLD-6-NEXT: [[W:%.*]] = select i1 [[A:%.*]], i64 2, i64 [[WP2]] +; CHECK-V8A-TWO-FOLD-6-NEXT: br label [[M]] +; CHECK-V8A-TWO-FOLD-6: M: +; CHECK-V8A-TWO-FOLD-6-NEXT: [[W:%.*]] = phi i64 [ [[WP2]], [[O]] ], [ 2, [[ENTRY:%.*]] ] ; CHECK-V8A-TWO-FOLD-6-NEXT: [[R:%.*]] = add i64 [[W]], 1 ; CHECK-V8A-TWO-FOLD-6-NEXT: ret i64 [[R]] ; diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll b/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll index 32581bbf8f141..d2d917de11897 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll @@ -199,3 +199,44 @@ exit: %ret = phi i64 [ 0, %default ], [ 0, %bb1 ], [ 1, %entry ], [ 1, %bb2 ] ret i64 %ret } + +define i32 @switch_dup_unbounded_predecessors(i32 %val) { +; SIMPLIFY-CFG-LABEL: define i32 @switch_dup_unbounded_predecessors( +; SIMPLIFY-CFG-SAME: i32 [[VAL:%.*]]) { +; SIMPLIFY-CFG-NEXT: [[ENTRY:.*]]: +; SIMPLIFY-CFG-NEXT: switch i32 [[VAL]], label %[[EXIT:.*]] [ +; SIMPLIFY-CFG-NEXT: i32 99, label %[[BB1:.*]] +; SIMPLIFY-CFG-NEXT: i32 115, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 102, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 70, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 101, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 69, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 103, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: ] +; SIMPLIFY-CFG: [[BB1]]: +; SIMPLIFY-CFG-NEXT: br label %[[EXIT]] +; SIMPLIFY-CFG: [[EXIT]]: +; SIMPLIFY-CFG-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[BB1]] ] +; SIMPLIFY-CFG-NEXT: ret i32 [[PHI]] +; +entry: + switch i32 %val, label %exit [ + i32 99, label %bb1 + i32 115, label %bb1 + i32 102, label %bb2 + i32 70, label %bb2 + i32 101, label %bb2 + i32 69, label %bb2 + i32 103, label %bb2 + ] + +bb1: + br label %exit + +bb2: + br label %exit + +exit: + %phi = phi i32 [ 0, %entry ], [ 1, %bb1 ], [ 1, %bb2 ] + ret i32 %phi +} diff --git a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll index 4136f33983a2b..8f2ae2d054f1e 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll @@ -149,7 +149,7 @@ unreach2: define void @pr53208_single_reachable_dest(i8 %sw, ptr %p0) { ; CHECK-LABEL: @pr53208_single_reachable_dest( -; CHECK-NEXT: group2: +; CHECK-NEXT: exit: ; CHECK-NEXT: call void @bar(ptr [[P0:%.*]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll new file mode 100644 index 0000000000000..9a102768b1277 --- /dev/null +++ b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll @@ -0,0 +1,34 @@ +;; This is a minimal reproducer that caused StackProtector to crash with a bad cast when +;; CrossDSOCFI is used. This test just needs to not crash. 
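+;; The cast likely assumed the __stack_chk_fail callee is always a plain
+;; Function; once lowertypetests rewrites the !type-annotated definition
+;; below, that assumption no longer holds.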
+; REQUIRES: x86-registered-target +; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=lowertypetests,cross-dso-cfi,stack-protector + +define hidden void @__stack_chk_fail() !type !1{ + unreachable +} + +define void @store_captures() sspstrong { +entry: + %a = alloca i32, align 4 + %j = alloca ptr, align 8 + store ptr %a, ptr %j, align 8 + ret void +} + +define void @func(ptr %0) { +entry: + %1 = call i1 @llvm.type.test(ptr %0, metadata !"typeid") + br i1 %1, label %cont, label %trap + +trap: ; preds = %entry + call void @llvm.trap() + unreachable + +cont: ; preds = %entry + call void %0() + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"Cross-DSO CFI", i32 1} +!1 = !{i64 0, !"typeid"} diff --git a/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll new file mode 100644 index 0000000000000..def3e014797de --- /dev/null +++ b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll @@ -0,0 +1,22 @@ +;; __stack_chk_fail should have the noreturn attr even if it is an alias +; REQUIRES: x86-registered-target +; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=stack-protector -S | FileCheck %s + +define hidden void @__stack_chk_fail_impl() { + unreachable +} + +@__stack_chk_fail = hidden alias void (), ptr @__stack_chk_fail_impl + +; CHECK-LABEL: @store_captures( +; CHECK: CallStackCheckFailBlk: +; CHECK-NEXT: call void @__stack_chk_fail() [[ATTRS:#.*]] +define void @store_captures() sspstrong { +entry: + %a = alloca i32, align 4 + %j = alloca ptr, align 8 + store ptr %a, ptr %j, align 8 + ret void +} + +; CHECK: attributes [[ATTRS]] = { noreturn } diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll index e6e5f5196d3da..5c035d29a7ea2 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll @@ -669,10 +669,10 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) { ; Scalarizing the load for multiple constant indices may not be profitable. define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx( -; CHECK-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]] -; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[TMP1]], i32 0, i32 1 +; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <4 x i32>, ptr %x @@ -686,10 +686,10 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { ; because the vector large vector requires 2 vector registers. 
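; An <8 x i32> load spans two 128-bit vector registers on AArch64, so two
; scalar loads of lanes 0 and 6 are cheaper than materializing the whole
; vector just to extract from both halves.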
define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable( -; CHECK-NEXT: [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]] -; CHECK-NEXT: [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[TMP1]], i32 0, i32 6 +; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <8 x i32>, ptr %x, align 16 diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index bd6e37c848d8c..143cc3817bd08 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -63,7 +63,7 @@ def get_asan_rtlib(): if ( not "Address" in config.llvm_use_sanitizer - or not "Darwin" in config.host_os + or not "Darwin" in config.target_os or not "x86" in config.host_triple ): return "" @@ -93,6 +93,13 @@ def get_asan_rtlib(): config.substitutions.append(("%exeext", config.llvm_exe_ext)) config.substitutions.append(("%llvm_src_root", config.llvm_src_root)) +# Add IR2Vec test vocabulary path substitution +config.substitutions.append( + ( + "%ir2vec_test_vocab_dir", + os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"), + ) +) lli_args = [] # The target triple used by default by lli is the process target triple (some @@ -197,6 +204,7 @@ def get_asan_rtlib(): "llvm-dlltool", "llvm-exegesis", "llvm-extract", + "llvm-ir2vec", "llvm-isel-fuzzer", "llvm-ifs", "llvm-install-name-tool", diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index caee6c1db92ee..ee76beb51cce6 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -26,7 +26,7 @@ config.enable_assertions = @ENABLE_ASSERTIONS@ config.targets_to_build = "@TARGETS_TO_BUILD@" config.native_target = "@LLVM_NATIVE_ARCH@" config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') -config.host_os = "@HOST_OS@" +config.target_os = "@HOST_OS@" config.host_cc = "@HOST_CC@" config.host_cxx = "@HOST_CXX@" # Note: ldflags can contain double-quoted paths, so must use single quotes here. diff --git a/llvm/test/tools/llc/new-pm/start-stop-inserted.ll b/llvm/test/tools/llc/new-pm/start-stop-inserted.ll new file mode 100644 index 0000000000000..ce5ad2d9e5065 --- /dev/null +++ b/llvm/test/tools/llc/new-pm/start-stop-inserted.ll @@ -0,0 +1,15 @@ +; REQUIRES: amdgpu-registered-target + +; AMDGPU inserts the fourth instance of dead-mi-elimination pass after detect-dead-lanes +; This checks that the pipeline stops before that. + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -enable-new-pm -stop-before=dead-mi-elimination,4 --print-pipeline-passes -filetype=null %s | FileCheck %s + +; There is no way to -start-after an inserted pass right now. 
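+; The RUN line below documents that limitation: it is expected to fail,
+; hence the leading 'not'.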
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -O3 -enable-new-pm -start-after=dead-mi-elimination,4 --print-pipeline-passes -filetype=null %s
+
+
+; CHECK: dead-mi-elimination
+; CHECK: dead-mi-elimination
+; CHECK: dead-mi-elimination
+; CHECK-NOT: dead-mi-elimination
diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll
index 13d9663221115..e4c454900fd38 100644
--- a/llvm/test/tools/llc/new-pm/start-stop.ll
+++ b/llvm/test/tools/llc/new-pm/start-stop.ll
@@ -2,4 +2,4 @@
; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -o /dev/null %s | FileCheck --match-full-lines %s --check-prefix=OBJ
; NULL: require,require,require,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify)
-; OBJ: require,require,require,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify),PrintMIRPreparePass,function(machine-function(print),invalidate)
+; OBJ: require,require,require,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify),PrintMIRPreparePass,function(machine-function(print),free-machine-function)
diff --git a/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s b/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s
index a4350fc6dc2cb..3ef664f899551 100644
--- a/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s
+++ b/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s
@@ -3,7 +3,7 @@ REQUIRES: aarch64-registered-target
## PPR Register Class Initialization Testcase
## Ideally, we should use PTRUE_{B/H/S/D} instead of FADDV_VPZ_D for an isolated test case;
## However, exegesis does not yet support PTRUE_{B/H/S/D}.
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FADDV_VPZ_D --benchmark-phase=assemble-measured-code 2>&1
+RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FADDV_VPZ_D --benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=PPR_ASM < %t.s
PPR_ASM: <foo>:
@@ -14,7 +14,7 @@ PPR_ASM-NEXT: faddv d{{[0-9]+}}, p{{[0-9]+}}, z{{[0-9]+}}
## ZPR Register Class Initialization Testcase
## Ideally, we should use DUP_ZI_{B/H/S/D} instead of FADDV_VPZ_D for an isolated test case;
## However, exegesis does not yet support DUP_ZI_{B/H/S/D}.
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FADDV_VPZ_D --benchmark-phase=assemble-measured-code 2>&1
+RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FADDV_VPZ_D --benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=ZPR_ASM < %t.s
ZPR_ASM: <foo>:
@@ -23,7 +23,7 @@ ZPR_ASM-NEXT: mov z{{[0-9]+}}.d, #0x0
ZPR_ASM-NEXT: faddv d{{[0-9]+}}, p{{[0-9]+}}, z{{[0-9]+}}
## FPR128 Register Class Initialization Testcase
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=ADDVv16i8v --benchmark-phase=assemble-measured-code 2>&1
+RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=ADDVv16i8v --benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=FPR128-ASM < %t.s
FPR128-ASM: <foo>:
@@ -31,7 +31,7 @@ FPR128-ASM: movi v{{[0-9]+}}.2d, #0000000000000000
FPR128-ASM-NEXT: addv b{{[0-9]+}}, v{{[0-9]+}}.16b
## FPR64 Register Class Initialization Testcase
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=ADDVv4i16v --benchmark-phase=assemble-measured-code 2>&1
+RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=ADDVv4i16v --benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=FPR64-ASM < %t.s
FPR64-ASM: <foo>:
@@ -39,7 +39,7 @@ FPR64-ASM: movi d{{[0-9]+}}, #0000000000000000
FPR64-ASM-NEXT: addv h{{[0-9]+}}, v{{[0-9]+}}.4h
## FPR32 Register Class Initialization Testcase
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FABSSr --benchmark-phase=assemble-measured-code 2>&1
+RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FABSSr --benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=FPR32-ASM < %t.s
FPR32-ASM: <foo>:
@@ -48,7 +48,7 @@ FPR32-ASM-NEXT: fabs s{{[0-9]+}}, s{{[0-9]+}}
## FPR16 Register Class Initialization Testcase
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FABSHr --benchmark-phase=assemble-measured-code 2>&1
+RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FABSHr --benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=FPR16-ASM < %t.s
FPR16-ASM: <foo>:
@@ -56,7 +56,7 @@ FPR16-ASM: movi d{{[0-9]+}}, #0000000000000000
FPR16-ASM-NEXT: fabs h{{[0-9]+}}, h{{[0-9]+}}
## FPR8 Register Class Initialization Testcase
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=SQABSv1i8 --benchmark-phase=assemble-measured-code 2>&1
+RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=SQABSv1i8 --benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=FPR8-ASM < %t.s
FPR8-ASM: <foo>:
@@ -65,7 +65,7 @@ FPR8-ASM-NEXT: sqabs b{{[0-9]+}}, b{{[0-9]+}}
## FPCR Register Class Initialization Testcase
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=BFCVT --benchmark-phase=assemble-measured-code 2>&1
+RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=BFCVT 
--benchmark-phase=assemble-measured-code 2>&1
RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=FPCR-ASM < %t.s
FPCR-ASM: <foo>:
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll b/llvm/test/tools/llvm-ir2vec/embeddings.ll
new file mode 100644
index 0000000000000..993ea865170f9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec --mode=embeddings --level=func --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec --mode=embeddings --level=func --function=def --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 878.00 889.00 900.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 878.00 889.00 900.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00 889.00 900.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat
+; CHECK-FUNC-LEVEL-NEXT: [ 878.00 889.00 900.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT: [ 878.00 889.00 900.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00 889.00 900.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 878.00 889.00 900.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00 889.00 900.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00 
92.00 93.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00 92.00 93.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 98.00 99.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4 [ 97.00 98.00 99.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %0 = load i32, ptr %a.addr, align 4 [ 94.00 95.00 96.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %1 = load i32, ptr %a.addr, align 4 [ 94.00 95.00 96.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %mul = mul nsw i32 %0, %1 [ 49.00 50.00 51.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %conv = sitofp i32 %mul to float [ 130.00 131.00 132.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %2 = load float, ptr %b.addr, align 4 [ 94.00 95.00 96.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %add = fadd float %conv, %2 [ 40.00 41.00 42.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: ret float %add [ 1.00 2.00 3.00 ] diff --git a/llvm/test/tools/llvm-ir2vec/triplets.ll b/llvm/test/tools/llvm-ir2vec/triplets.ll new file mode 100644 index 0000000000000..d1ef5b388e258 --- /dev/null +++ b/llvm/test/tools/llvm-ir2vec/triplets.ll @@ -0,0 +1,38 @@ +; RUN: llvm-ir2vec --mode=triplets %s | FileCheck %s -check-prefix=TRIPLETS + +define i32 @simple_add(i32 %a, i32 %b) { +entry: + %add = add i32 %a, %b + ret i32 %add +} + +define i32 @simple_mul(i32 %x, i32 %y) { +entry: + %mul = mul i32 %x, %y + ret i32 %mul +} + +define i32 @test_function(i32 %arg1, i32 %arg2) { +entry: + %local1 = alloca i32, align 4 + %local2 = alloca i32, align 4 + store i32 %arg1, ptr %local1, align 4 + store i32 %arg2, ptr %local2, align 4 + %load1 = load i32, ptr %local1, align 4 + %load2 = load i32, ptr %local2, align 4 + %result = add i32 %load1, %load2 + ret i32 %result +} + +; TRIPLETS: Add IntegerTy Variable Variable +; TRIPLETS-NEXT: Ret VoidTy Variable +; TRIPLETS-NEXT: Mul IntegerTy Variable Variable +; TRIPLETS-NEXT: Ret VoidTy Variable +; TRIPLETS-NEXT: Alloca PointerTy Constant +; TRIPLETS-NEXT: Alloca PointerTy Constant +; TRIPLETS-NEXT: Store VoidTy Variable Pointer +; TRIPLETS-NEXT: Store VoidTy Variable Pointer +; TRIPLETS-NEXT: Load IntegerTy Pointer +; TRIPLETS-NEXT: Load IntegerTy Pointer +; TRIPLETS-NEXT: Add IntegerTy Variable Variable +; TRIPLETS-NEXT: Ret VoidTy Variable diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s index bc9229471b20e..8838c862e6b75 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s @@ -107,6 +107,9 @@ amomaxu.d.aqrl s5, s4, (s3) # CHECK-NEXT: [2] - SMX60_IEUA:1 # CHECK-NEXT: [3] - SMX60_IEUB:1 # CHECK-NEXT: [4] - SMX60_LS:2 +# CHECK-NEXT: [5] - SMX60_VFP:1 +# CHECK-NEXT: [6] - SMX60_VIEU:1 +# CHECK-NEXT: [7] - SMX60_VLS:1 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -215,98 +218,101 @@ amomaxu.d.aqrl s5, s4, (s3) # CHECK-NEXT: [2] - SMX60_IEUB # CHECK-NEXT: [3.0] - SMX60_LS # CHECK-NEXT: [3.1] - SMX60_LS +# CHECK-NEXT: [4] - SMX60_VFP +# CHECK-NEXT: [5] - SMX60_VIEU +# CHECK-NEXT: [6] - SMX60_VLS # CHECK: Resource pressure per iteration: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] -# CHECK-NEXT: - - - 44.00 44.00 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] +# CHECK-NEXT: - - - 44.00 44.00 - - - # CHECK: Resource pressure by instruction: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] Instructions: -# CHECK-NEXT: - - - 0.50 0.50 lr.w t0, (t1) -# CHECK-NEXT: - - - 0.50 0.50 lr.w.aq t1, (t2) -# CHECK-NEXT: - - - 0.50 0.50 lr.w.rl t2, (t3) -# CHECK-NEXT: - 
- - 0.50 0.50 lr.w.aqrl t3, (t4) -# CHECK-NEXT: - - - 0.50 0.50 sc.w t6, t5, (t4) -# CHECK-NEXT: - - - 0.50 0.50 sc.w.aq t5, t4, (t3) -# CHECK-NEXT: - - - 0.50 0.50 sc.w.rl t4, t3, (t2) -# CHECK-NEXT: - - - 0.50 0.50 sc.w.aqrl t3, t2, (t1) -# CHECK-NEXT: - - - 0.50 0.50 lr.d t0, (t1) -# CHECK-NEXT: - - - 0.50 0.50 lr.d.aq t1, (t2) -# CHECK-NEXT: - - - 0.50 0.50 lr.d.rl t2, (t3) -# CHECK-NEXT: - - - 0.50 0.50 lr.d.aqrl t3, (t4) -# CHECK-NEXT: - - - 0.50 0.50 sc.d t6, t5, (t4) -# CHECK-NEXT: - - - 0.50 0.50 sc.d.aq t5, t4, (t3) -# CHECK-NEXT: - - - 0.50 0.50 sc.d.rl t4, t3, (t2) -# CHECK-NEXT: - - - 0.50 0.50 sc.d.aqrl t3, t2, (t1) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.w a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.w a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.w a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.w a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.w a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.w a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.w s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.w s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.w s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.w.aq a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.w.aq a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.w.aq a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.w.aq a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.w.aq a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.w.aq a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.w.aq s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.w.aq s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.w.aq s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.w.rl a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.w.rl a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.w.rl a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.w.rl a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.w.rl a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.w.rl a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.w.rl s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.w.rl s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.w.rl s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.w.aqrl a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.w.aqrl a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.w.aqrl a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.w.aqrl a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.w.aqrl a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.w.aqrl a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.w.aqrl s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.w.aqrl s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.w.aqrl s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.d a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.d a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.d a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.d a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.d a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.d a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.d s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.d s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.d s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.d.aq a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.d.aq a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.d.aq a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.d.aq a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.d.aq a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.d.aq a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.d.aq s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 
amominu.d.aq s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.d.aq s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.d.rl a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.d.rl a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.d.rl a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.d.rl a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.d.rl a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.d.rl a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.d.rl s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.d.rl s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.d.rl s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.d.aqrl a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.d.aqrl a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.d.aqrl a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.d.aqrl a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.d.aqrl a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.d.aqrl a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.d.aqrl s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.d.aqrl s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.d.aqrl s5, s4, (s3) +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.w t0, (t1) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.w.aq t1, (t2) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.w.rl t2, (t3) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.w.aqrl t3, (t4) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.w t6, t5, (t4) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.w.aq t5, t4, (t3) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.w.rl t4, t3, (t2) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.w.aqrl t3, t2, (t1) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.d t0, (t1) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.d.aq t1, (t2) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.d.rl t2, (t3) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.d.aqrl t3, (t4) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.d t6, t5, (t4) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.d.aq t5, t4, (t3) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.d.rl t4, t3, (t2) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.d.aqrl t3, t2, (t1) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.w a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.w a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.w a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.w a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.w a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.w a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.w s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.w s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.w s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.w.aq a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.w.aq a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.w.aq a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.w.aq a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.w.aq a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.w.aq a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.w.aq s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.w.aq s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.w.aq s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.w.rl a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.w.rl a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.w.rl a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.w.rl a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.w.rl a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.w.rl a5, a6, (a7) +# CHECK-NEXT: - - 
- 0.50 0.50 - - - amomax.w.rl s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.w.rl s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.w.rl s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.w.aqrl a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.w.aqrl a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.w.aqrl a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.w.aqrl a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.w.aqrl a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.w.aqrl a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.w.aqrl s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.w.aqrl s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.w.aqrl s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.d a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.d a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.d a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.d a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.d a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.d a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.d s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.d s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.d s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.d.aq a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.d.aq a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.d.aq a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.d.aq a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.d.aq a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.d.aq a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.d.aq s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.d.aq s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.d.aq s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.d.rl a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.d.rl a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.d.rl a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.d.rl a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.d.rl a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.d.rl a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.d.rl s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.d.rl s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.d.rl s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.d.aqrl a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.d.aqrl a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.d.aqrl a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.d.aqrl a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.d.aqrl a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.d.aqrl a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.d.aqrl s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.d.aqrl s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.d.aqrl s5, s4, (s3) diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s index b86fcbccbeabb..78f4e7f50c745 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s @@ -135,6 +135,9 @@ fclass.d a3, ft10 # CHECK-NEXT: [2] - SMX60_IEUA:1 # CHECK-NEXT: [3] - SMX60_IEUB:1 # CHECK-NEXT: [4] - SMX60_LS:2 +# CHECK-NEXT: [5] - SMX60_VFP:1 +# CHECK-NEXT: [6] - SMX60_VIEU:1 +# CHECK-NEXT: [7] - SMX60_VLS:1 # CHECK: Instruction Info: # 
CHECK-NEXT: [1]: #uOps @@ -240,95 +243,98 @@ fclass.d a3, ft10 # CHECK-NEXT: [2] - SMX60_IEUB # CHECK-NEXT: [3.0] - SMX60_LS # CHECK-NEXT: [3.1] - SMX60_LS +# CHECK-NEXT: [4] - SMX60_VFP +# CHECK-NEXT: [5] - SMX60_VIEU +# CHECK-NEXT: [6] - SMX60_VLS # CHECK: Resource pressure per iteration: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] -# CHECK-NEXT: 149.00 11.00 11.00 3.00 3.00 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] +# CHECK-NEXT: 149.00 11.00 11.00 3.00 3.00 - - - # CHECK: Resource pressure by instruction: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] Instructions: -# CHECK-NEXT: - - - 0.50 0.50 flh ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 fsh ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 flw ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 fsw ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 fld ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 fsd ft0, 0(a0) -# CHECK-NEXT: 1.00 - - - - fadd.h fs10, fs11, ft8 -# CHECK-NEXT: 1.00 - - - - fsub.h ft9, ft10, ft11 -# CHECK-NEXT: 1.00 - - - - fmul.h ft0, ft1, ft2 -# CHECK-NEXT: 12.00 - - - - fdiv.h ft3, ft4, ft5 -# CHECK-NEXT: 12.00 - - - - fsqrt.h ft6, ft7 -# CHECK-NEXT: 1.00 - - - - fmin.h fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fmax.h fs2, fs3, fs4 -# CHECK-NEXT: 1.00 - - - - fmadd.h fa0, fa1, fa2, ft11 -# CHECK-NEXT: 1.00 - - - - fmsub.h fa4, fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fnmsub.h fs2, fs3, fs4, fs5 -# CHECK-NEXT: 1.00 - - - - fnmadd.h fs6, fs7, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - fadd.s fs10, fs11, ft8 -# CHECK-NEXT: 1.00 - - - - fsub.s ft9, ft10, ft11 -# CHECK-NEXT: 1.00 - - - - fmul.s ft0, ft1, ft2 -# CHECK-NEXT: 15.00 - - - - fdiv.s ft3, ft4, ft5 -# CHECK-NEXT: 15.00 - - - - fsqrt.s ft6, ft7 -# CHECK-NEXT: 1.00 - - - - fmin.s fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fmax.s fs2, fs3, fs4 -# CHECK-NEXT: 1.00 - - - - fmadd.s fa0, fa1, fa2, ft11 -# CHECK-NEXT: 1.00 - - - - fmsub.s fa4, fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fnmsub.s fs2, fs3, fs4, fs5 -# CHECK-NEXT: 1.00 - - - - fnmadd.s fs6, fs7, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - fadd.d fs10, fs11, ft8 -# CHECK-NEXT: 1.00 - - - - fsub.d ft9, ft10, ft11 -# CHECK-NEXT: 1.00 - - - - fmul.d ft0, ft1, ft2 -# CHECK-NEXT: 22.00 - - - - fdiv.d ft3, ft4, ft5 -# CHECK-NEXT: 22.00 - - - - fsqrt.d ft6, ft7 -# CHECK-NEXT: 1.00 - - - - fmin.d fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fmax.d fs2, fs3, fs4 -# CHECK-NEXT: 1.00 - - - - fmadd.d fa0, fa1, fa2, ft11 -# CHECK-NEXT: 1.00 - - - - fmsub.d fa4, fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fnmsub.d fs2, fs3, fs4, fs5 -# CHECK-NEXT: 1.00 - - - - fnmadd.d fs6, fs7, fs8, fs9 -# CHECK-NEXT: - 0.50 0.50 - - fmv.x.h a2, fs7 -# CHECK-NEXT: - 0.50 0.50 - - fmv.h.x ft1, a6 -# CHECK-NEXT: 1.00 - - - - fcvt.s.h fa0, ft0 -# CHECK-NEXT: 1.00 - - - - fcvt.s.h fa0, ft0, rup -# CHECK-NEXT: 1.00 - - - - fcvt.h.s ft2, fa2 -# CHECK-NEXT: 1.00 - - - - fcvt.d.h fa0, ft0 -# CHECK-NEXT: 1.00 - - - - fcvt.d.h fa0, ft0, rup -# CHECK-NEXT: 1.00 - - - - fcvt.h.d ft2, fa2 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.w.s a0, fs5 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.wu.s a1, fs6 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.s.w ft11, a4 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.s.wu ft0, a5 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.l.s a0, ft0 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.lu.s a1, ft1 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.s.l ft2, a2 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.s.lu ft3, a3 -# CHECK-NEXT: - 0.50 0.50 - - fmv.x.w a2, fs7 -# CHECK-NEXT: - 0.50 0.50 - - fmv.w.x ft1, a6 -# CHECK-NEXT: 1.00 - - - - fsgnj.s fs1, fa0, fa1 -# CHECK-NEXT: 1.00 - - - - fsgnjn.s fa1, fa3, fa4 -# CHECK-NEXT: - 0.50 0.50 - - 
fcvt.wu.d a4, ft11 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.w.d a4, ft11 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.d.w ft0, a5 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.d.wu ft1, a6 -# CHECK-NEXT: 1.00 - - - - fcvt.s.d fs5, fs6 -# CHECK-NEXT: 1.00 - - - - fcvt.d.s fs7, fs8 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.l.d a0, ft0 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.lu.d a1, ft1 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.d.l ft3, a3 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.d.lu ft4, a4 -# CHECK-NEXT: - 0.50 0.50 - - fmv.x.d a2, ft2 -# CHECK-NEXT: - 0.50 0.50 - - fmv.d.x ft5, a5 -# CHECK-NEXT: 1.00 - - - - fsgnj.d fs1, fa0, fa1 -# CHECK-NEXT: 1.00 - - - - fsgnjn.d fa1, fa3, fa4 -# CHECK-NEXT: 1.00 - - - - feq.h a1, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - flt.h a2, fs10, fs11 -# CHECK-NEXT: 1.00 - - - - fle.h a3, ft8, ft9 -# CHECK-NEXT: 1.00 - - - - feq.s a1, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - flt.s a2, fs10, fs11 -# CHECK-NEXT: 1.00 - - - - fle.s a3, ft8, ft9 -# CHECK-NEXT: 1.00 - - - - feq.d a1, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - flt.d a2, fs10, fs11 -# CHECK-NEXT: 1.00 - - - - fle.d a3, ft8, ft9 -# CHECK-NEXT: 1.00 - - - - fclass.s a3, ft10 -# CHECK-NEXT: 1.00 - - - - fclass.s a3, ft10 -# CHECK-NEXT: 1.00 - - - - fclass.d a3, ft10 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: +# CHECK-NEXT: - - - 0.50 0.50 - - - flh ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - fsh ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - flw ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - fsw ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - fld ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - fsd ft0, 0(a0) +# CHECK-NEXT: 1.00 - - - - - - - fadd.h fs10, fs11, ft8 +# CHECK-NEXT: 1.00 - - - - - - - fsub.h ft9, ft10, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmul.h ft0, ft1, ft2 +# CHECK-NEXT: 12.00 - - - - - - - fdiv.h ft3, ft4, ft5 +# CHECK-NEXT: 12.00 - - - - - - - fsqrt.h ft6, ft7 +# CHECK-NEXT: 1.00 - - - - - - - fmin.h fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fmax.h fs2, fs3, fs4 +# CHECK-NEXT: 1.00 - - - - - - - fmadd.h fa0, fa1, fa2, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmsub.h fa4, fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fnmsub.h fs2, fs3, fs4, fs5 +# CHECK-NEXT: 1.00 - - - - - - - fnmadd.h fs6, fs7, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - fadd.s fs10, fs11, ft8 +# CHECK-NEXT: 1.00 - - - - - - - fsub.s ft9, ft10, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmul.s ft0, ft1, ft2 +# CHECK-NEXT: 15.00 - - - - - - - fdiv.s ft3, ft4, ft5 +# CHECK-NEXT: 15.00 - - - - - - - fsqrt.s ft6, ft7 +# CHECK-NEXT: 1.00 - - - - - - - fmin.s fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fmax.s fs2, fs3, fs4 +# CHECK-NEXT: 1.00 - - - - - - - fmadd.s fa0, fa1, fa2, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmsub.s fa4, fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fnmsub.s fs2, fs3, fs4, fs5 +# CHECK-NEXT: 1.00 - - - - - - - fnmadd.s fs6, fs7, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - fadd.d fs10, fs11, ft8 +# CHECK-NEXT: 1.00 - - - - - - - fsub.d ft9, ft10, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmul.d ft0, ft1, ft2 +# CHECK-NEXT: 22.00 - - - - - - - fdiv.d ft3, ft4, ft5 +# CHECK-NEXT: 22.00 - - - - - - - fsqrt.d ft6, ft7 +# CHECK-NEXT: 1.00 - - - - - - - fmin.d fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fmax.d fs2, fs3, fs4 +# CHECK-NEXT: 1.00 - - - - - - - fmadd.d fa0, fa1, fa2, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmsub.d fa4, fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fnmsub.d fs2, fs3, fs4, fs5 +# CHECK-NEXT: 1.00 - - - - - - - fnmadd.d fs6, fs7, fs8, fs9 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.x.h a2, fs7 +# 
CHECK-NEXT: - 0.50 0.50 - - - - - fmv.h.x ft1, a6 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.s.h fa0, ft0 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.s.h fa0, ft0, rup +# CHECK-NEXT: 1.00 - - - - - - - fcvt.h.s ft2, fa2 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.d.h fa0, ft0 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.d.h fa0, ft0, rup +# CHECK-NEXT: 1.00 - - - - - - - fcvt.h.d ft2, fa2 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.w.s a0, fs5 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.wu.s a1, fs6 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.s.w ft11, a4 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.s.wu ft0, a5 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.l.s a0, ft0 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.lu.s a1, ft1 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.s.l ft2, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.s.lu ft3, a3 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.x.w a2, fs7 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.w.x ft1, a6 +# CHECK-NEXT: 1.00 - - - - - - - fsgnj.s fs1, fa0, fa1 +# CHECK-NEXT: 1.00 - - - - - - - fsgnjn.s fa1, fa3, fa4 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.wu.d a4, ft11 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.w.d a4, ft11 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.d.w ft0, a5 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.d.wu ft1, a6 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.s.d fs5, fs6 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.d.s fs7, fs8 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.l.d a0, ft0 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.lu.d a1, ft1 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.d.l ft3, a3 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.d.lu ft4, a4 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.x.d a2, ft2 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.d.x ft5, a5 +# CHECK-NEXT: 1.00 - - - - - - - fsgnj.d fs1, fa0, fa1 +# CHECK-NEXT: 1.00 - - - - - - - fsgnjn.d fa1, fa3, fa4 +# CHECK-NEXT: 1.00 - - - - - - - feq.h a1, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - flt.h a2, fs10, fs11 +# CHECK-NEXT: 1.00 - - - - - - - fle.h a3, ft8, ft9 +# CHECK-NEXT: 1.00 - - - - - - - feq.s a1, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - flt.s a2, fs10, fs11 +# CHECK-NEXT: 1.00 - - - - - - - fle.s a3, ft8, ft9 +# CHECK-NEXT: 1.00 - - - - - - - feq.d a1, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - flt.d a2, fs10, fs11 +# CHECK-NEXT: 1.00 - - - - - - - fle.d a3, ft8, ft9 +# CHECK-NEXT: 1.00 - - - - - - - fclass.s a3, ft10 +# CHECK-NEXT: 1.00 - - - - - - - fclass.s a3, ft10 +# CHECK-NEXT: 1.00 - - - - - - - fclass.d a3, ft10 diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s index b72540f29f487..51a036aaae784 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s @@ -170,6 +170,9 @@ bseti a0, a1, 1 # CHECK-NEXT: [2] - SMX60_IEUA:1 # CHECK-NEXT: [3] - SMX60_IEUB:1 # CHECK-NEXT: [4] - SMX60_LS:2 +# CHECK-NEXT: [5] - SMX60_VFP:1 +# CHECK-NEXT: [6] - SMX60_VIEU:1 +# CHECK-NEXT: [7] - SMX60_VLS:1 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -306,126 +309,129 @@ bseti a0, a1, 1 # CHECK-NEXT: [2] - SMX60_IEUB # CHECK-NEXT: [3.0] - SMX60_LS # CHECK-NEXT: [3.1] - SMX60_LS +# CHECK-NEXT: [4] - SMX60_VFP +# CHECK-NEXT: [5] - SMX60_VIEU +# CHECK-NEXT: [6] - SMX60_VLS # CHECK: Resource pressure per iteration: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] -# CHECK-NEXT: - 180.50 44.50 5.50 5.50 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] +# CHECK-NEXT: - 180.50 44.50 5.50 5.50 - - - # CHECK: Resource pressure by instruction: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] Instructions: -# CHECK-NEXT: - 
0.50 0.50 - - addi a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - addiw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - slti a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - seqz a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - andi a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - ori a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - xori a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - slli a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - srli a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - srai a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - slliw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - srliw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - sraiw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - lui a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - auipc a1, 1 -# CHECK-NEXT: - 0.50 0.50 - - add a0, a0, a1 -# CHECK-NEXT: - 0.50 0.50 - - addw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - slt a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sltu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - and a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - or a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - xor a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sll a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - srl a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sra a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sllw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - srlw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sraw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sub a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - subw a0, a0, a0 -# CHECK-NEXT: - 1.00 - - - jal a0, .Ltmp0 -# CHECK-NEXT: - 1.00 - - - jalr a0 -# CHECK-NEXT: - 1.00 - - - beq a0, a0, .Ltmp1 -# CHECK-NEXT: - 1.00 - - - bne a0, a0, .Ltmp2 -# CHECK-NEXT: - 1.00 - - - blt a0, a0, .Ltmp3 -# CHECK-NEXT: - 1.00 - - - bltu a0, a0, .Ltmp4 -# CHECK-NEXT: - 1.00 - - - bge a0, a0, .Ltmp5 -# CHECK-NEXT: - 1.00 - - - bgeu a0, a0, .Ltmp6 -# CHECK-NEXT: - 0.50 0.50 - - add a0, a0, a0 -# CHECK-NEXT: - - - 0.50 0.50 lb t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lbu t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lh t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lhu t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lw t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lwu t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 ld t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 sb t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 sh t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 sw t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 sd t0, 0(a0) -# CHECK-NEXT: - 0.50 0.50 - - mul a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - mulh a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - mulhu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - mulhsu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - mulw a0, a0, a0 -# CHECK-NEXT: - 20.00 - - - div a0, a1, a2 -# CHECK-NEXT: - 20.00 - - - divu a0, a1, a2 -# CHECK-NEXT: - 20.00 - - - rem a0, a1, a2 -# CHECK-NEXT: - 20.00 - - - remu a0, a1, a2 -# CHECK-NEXT: - 12.00 - - - divw a0, a1, a2 -# CHECK-NEXT: - 12.00 - - - divuw a0, a1, a2 -# CHECK-NEXT: - 12.00 - - - remw a0, a1, a2 -# CHECK-NEXT: - 12.00 - - - remuw a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - csrrw t0, 4095, t1 -# CHECK-NEXT: - 0.50 0.50 - - csrrs s3, fflags, s5 -# CHECK-NEXT: - 0.50 0.50 - - csrrc sp, 0, ra -# CHECK-NEXT: - 0.50 0.50 - - csrrwi a5, 0, 0 -# CHECK-NEXT: - 0.50 0.50 - - csrrsi t2, 4095, 31 -# CHECK-NEXT: - 0.50 0.50 - - csrrci t1, sscratch, 5 -# CHECK-NEXT: - 0.50 0.50 - - czero.eqz a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - czero.nez a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - czero.eqz a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - czero.nez a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - add.uw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - slli.uw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - sh1add.uw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh2add.uw 
a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh3add.uw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh1add a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh2add a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh3add a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - andn a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - orn a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - xnor a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clz a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clzw a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - ctz a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - ctzw a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - cpop a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - cpopw a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - min a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - minu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - max a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - maxu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sext.b a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sext.h a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - zext.h a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rol a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rolw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - ror a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rorw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rori a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - roriw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - orc.b a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rev8 a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clmul a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clmulr a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clmulh a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - bclr a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - bclri a0, a1, 1 -# CHECK-NEXT: - 0.50 0.50 - - bext a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - bexti a0, a1, 1 -# CHECK-NEXT: - 0.50 0.50 - - binv a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - binvi a0, a1, 1 -# CHECK-NEXT: - 0.50 0.50 - - bset a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - bseti a0, a1, 1 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: +# CHECK-NEXT: - 0.50 0.50 - - - - - addi a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - addiw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - slti a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - seqz a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - andi a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - ori a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - xori a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - slli a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - srli a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - srai a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - slliw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - srliw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - sraiw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - lui a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - auipc a1, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - add a0, a0, a1 +# CHECK-NEXT: - 0.50 0.50 - - - - - addw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - slt a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sltu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - and a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - or a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - xor a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sll a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - srl a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sra a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sllw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - srlw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sraw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sub a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - subw a0, a0, a0 +# CHECK-NEXT: - 1.00 - - - - - - jal a0, .Ltmp0 +# CHECK-NEXT: - 1.00 - - - - - - jalr a0 +# CHECK-NEXT: - 1.00 - - 
- - - - beq a0, a0, .Ltmp1 +# CHECK-NEXT: - 1.00 - - - - - - bne a0, a0, .Ltmp2 +# CHECK-NEXT: - 1.00 - - - - - - blt a0, a0, .Ltmp3 +# CHECK-NEXT: - 1.00 - - - - - - bltu a0, a0, .Ltmp4 +# CHECK-NEXT: - 1.00 - - - - - - bge a0, a0, .Ltmp5 +# CHECK-NEXT: - 1.00 - - - - - - bgeu a0, a0, .Ltmp6 +# CHECK-NEXT: - 0.50 0.50 - - - - - add a0, a0, a0 +# CHECK-NEXT: - - - 0.50 0.50 - - - lb t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lbu t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lh t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lhu t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lw t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lwu t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - ld t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - sb t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - sh t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - sw t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - sd t0, 0(a0) +# CHECK-NEXT: - 0.50 0.50 - - - - - mul a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - mulh a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - mulhu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - mulhsu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - mulw a0, a0, a0 +# CHECK-NEXT: - 20.00 - - - - - - div a0, a1, a2 +# CHECK-NEXT: - 20.00 - - - - - - divu a0, a1, a2 +# CHECK-NEXT: - 20.00 - - - - - - rem a0, a1, a2 +# CHECK-NEXT: - 20.00 - - - - - - remu a0, a1, a2 +# CHECK-NEXT: - 12.00 - - - - - - divw a0, a1, a2 +# CHECK-NEXT: - 12.00 - - - - - - divuw a0, a1, a2 +# CHECK-NEXT: - 12.00 - - - - - - remw a0, a1, a2 +# CHECK-NEXT: - 12.00 - - - - - - remuw a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrw t0, 4095, t1 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrs s3, fflags, s5 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrc sp, 0, ra +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrwi a5, 0, 0 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrsi t2, 4095, 31 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrci t1, sscratch, 5 +# CHECK-NEXT: - 0.50 0.50 - - - - - czero.eqz a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - czero.nez a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - czero.eqz a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - czero.nez a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - add.uw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - slli.uw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh1add.uw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh2add.uw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh3add.uw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh1add a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh2add a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh3add a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - andn a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - orn a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - xnor a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clz a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clzw a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - ctz a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - ctzw a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - cpop a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - cpopw a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - min a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - minu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - max a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - maxu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sext.b a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sext.h a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - zext.h a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rol a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rolw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - ror 
a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rorw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rori a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - roriw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - orc.b a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rev8 a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clmul a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clmulr a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clmulh a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - bclr a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - bclri a0, a1, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - bext a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - bexti a0, a1, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - binv a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - binvi a0, a1, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - bset a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - bseti a0, a1, 1 diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s new file mode 100644 index 0000000000000..c7755dcc37658 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s @@ -0,0 +1,6820 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=spacemit-x60 -iterations=1 -instruction-tables=full < %s | FileCheck %s + +# Basic arithmetic operations + +vsetvli x28, x0, e8, mf2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vadd.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e64, 
m1, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vadd.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vadd.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vsub.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, 
m8, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vsub.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m1, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m8, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m1, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m8, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m1, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m8, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m1, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m8, tu, mu +vadc.vvm v8, v8, v8, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m1, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m8, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m1, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m8, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m1, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m8, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m1, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m8, tu, mu +vadc.vxm v8, v8, x30, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m1, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m8, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m1, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m4, tu, mu 
+vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m8, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m1, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m8, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m1, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m8, tu, mu +vadc.vim v8, v8, 12, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m1, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m8, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m1, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m8, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m1, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m8, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m1, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m8, tu, mu +vsbc.vvm v8, v8, v8, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m1, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m8, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m1, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m8, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m1, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m8, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m1, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m8, tu, mu +vsbc.vxm v8, v8, x30, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, m4, 
tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwaddu.vv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwaddu.vx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwadd.vv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwadd.vx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwsubu.vv v8, v16, v24 
+vsetvli x28, x0, e16, mf4, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwsubu.vv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwsubu.vx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwsub.vv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwsub.vx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vaaddu.vv v8, v8, 
v8 +vsetvli x28, x0, e16, m1, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vaaddu.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vaaddu.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vaadd.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, 
m8, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vaadd.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vasubu.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vasubu.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, 
m1, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vasub.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vasub.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, mf4, 
tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vim v8, v8, 12, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m1, tu, 
mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vvm v8, v8, v8, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vxm v8, v8, x30, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vmsbc.vv v8, v8, 
v8 +vsetvli x28, x0, e32, m1, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vmsbc.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m1, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m1, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmsbc.vvm v8, v8, v8, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vmsbc.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli 
x28, x0, e16, m1, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m1, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmsbc.vxm v8, v8, x30, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vrsub.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vrsub.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vsaddu.vi 
v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vsaddu.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vsaddu.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vsaddu.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vsadd.vi 
v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vsadd.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vsadd.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vsadd.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vssubu.vv v8, v8, 
v8 +vsetvli x28, x0, e8, mf4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vssubu.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vssubu.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, 
e64, m4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vssub.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vssub.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwaddu.wv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwaddu.wx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, m1, 
tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwadd.wv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwadd.wx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwsubu.wv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwsubu.wx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli 
x28, x0, e16, m4, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwsub.wv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwsub.wx v8, v16, x30 +
+# CHECK: Resources: +# CHECK-NEXT: [0] - SMX60_FP:1 +# CHECK-NEXT: [1] - SMX60_IEU:2 SMX60_IEUA, SMX60_IEUB +# CHECK-NEXT: [2] - SMX60_IEUA:1 +# CHECK-NEXT: [3] - SMX60_IEUB:1 +# CHECK-NEXT: [4] - SMX60_LS:2 +# CHECK-NEXT: [5] - SMX60_VFP:1 +# CHECK-NEXT: [6] - SMX60_VIEU:1 +# CHECK-NEXT: [7] - SMX60_VLS:1 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) +# CHECK-NEXT: [7]: Bypass Latency +# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>,<Name>[<ReleaseAtCycle>,<ReleaseAtCycle>])
diff --git a/llvm/test/tools/llvm-objdump/ELF/RISCV/rv32-plt.test b/llvm/test/tools/llvm-objdump/ELF/RISCV/rv32-plt.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/RISCV/rv32-plt.test +# RUN: yaml2obj %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s + +# CHECK: 00001470 <strcmp@plt>: +# CHECK-NEXT: 1470: auipc t3, 0x2 +# CHECK-NEXT: 1474: lw t3, 0xfc(t3) +# CHECK-NEXT: 1478: jalr t1, t3 +# CHECK-NEXT: 147c: nop + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_RISCV + Flags: [ EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_DOUBLE ] +Sections: + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC, SHF_INFO_LINK ] + Address: 0x290 + Link: .dynsym + AddressAlign: 0x4 + Info: .got.plt + Relocations: + - Offset: 0x356C + Symbol: strcmp + Type: R_RISCV_JUMP_SLOT + - Name: .plt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1450 + AddressAlign: 0x10 + Content: 972300003303C34103AE4311130343FD938243111353230083A2420067000E00172E0000032ECE0F67030E0013000000 + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3564 + AddressAlign: 0x4 + Content: '000000000000000050140000' +DynamicSymbols: + - Name: strcmp + Type: STT_FUNC + Binding: STB_GLOBAL +...
diff --git a/llvm/test/tools/llvm-objdump/ELF/RISCV/rv64-plt.test b/llvm/test/tools/llvm-objdump/ELF/RISCV/rv64-plt.test new file mode 100644 index 0000000000000..a6ecbf4a0a33f --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/RISCV/rv64-plt.test @@ -0,0 +1,44 @@ +# RUN: yaml2obj %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s + +# CHECK: 0000000000001630 <strcmp@plt>: +# CHECK-NEXT: 1630: auipc t3, 0x2 +# CHECK-NEXT: 1634: ld t3, 0x1b8(t3) +# CHECK-NEXT: 1638: jalr t1, t3 +# CHECK-NEXT: 163c: nop + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_RISCV + Flags: [ EF_RISCV_RVC, EF_RISCV_FLOAT_ABI_DOUBLE ] +Sections: + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC, SHF_INFO_LINK ] + Address: 0x408 + Link: .dynsym + AddressAlign: 0x8 + Info: .got.plt + Relocations: + - Offset: 0x37E8 + Symbol: strcmp + Type: R_RISCV_JUMP_SLOT + - Name: .plt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1610 + AddressAlign: 0x10 + Content: 972300003303C34103BE831C130343FD9382831C1353130083B2820067000E00172E0000033E8E1B67030E0013000000 + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x37D8 + AddressAlign: 0x8 + Content: '000000000000000000000000000000001016000000000000' +DynamicSymbols: + - Name: strcmp + Type: STT_FUNC + Binding: STB_GLOBAL +...
diff --git a/llvm/test/tools/llvm-objdump/ELF/program-headers.test b/llvm/test/tools/llvm-objdump/ELF/program-headers.test index abd4894c45124..a6be34696a87f 100644 --- a/llvm/test/tools/llvm-objdump/ELF/program-headers.test +++ b/llvm/test/tools/llvm-objdump/ELF/program-headers.test @@ -4,51 +4,53 @@ # RUN: llvm-objdump --private-headers %t32.elf | FileCheck %s --check-prefixes=ELF32 # ELF32: Program Header: -# ELF32-NEXT: PHDR off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: PHDR off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags -w- -# ELF32-NEXT: PHDR off 0x00000317 vaddr 0x00002000 paddr 0x00002000 align 2**0 +# ELF32-NEXT: PHDR off 0x00000337 vaddr 0x00002000 paddr 0x00002000 align 2**0 # ELF32-NEXT: filesz 0x00000007 memsz 0x00000007 flags --x -# ELF32-NEXT: UNKNOWN off 0x00000317 vaddr 0x00002000 paddr 0x00002000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000337 vaddr 0x00002000 paddr 0x00002000 align 2**0 # ELF32-NEXT: filesz 0x00000007 memsz 0x00000007 flags --x -# ELF32-NEXT: DYNAMIC off 0x00000324 vaddr 0x00006000 paddr 0x00006000 align 2**0 +# ELF32-NEXT: DYNAMIC off 0x00000344 vaddr 0x00006000 paddr 0x00006000 align 2**0 # ELF32-NEXT: filesz 0x00000010 memsz 0x00000010 flags rwx -# ELF32-NEXT: INTERP off 0x0000031e vaddr 0x00003000 paddr 0x00003000 align 2**0 +# ELF32-NEXT: INTERP off 0x0000033e vaddr 0x00003000 paddr 0x00003000 align 2**0 # ELF32-NEXT: filesz 0x00000004 memsz 0x00000004 flags rw- -# ELF32-NEXT: NOTE off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: NOTE off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000001 memsz 0x00000001 flags --- -# ELF32-NEXT: TLS off 0x00000322 vaddr 0x00004000 paddr 0x00004000 align 2**0 +# ELF32-NEXT: TLS off 0x00000342 vaddr 0x00004000 paddr 0x00004000 align 2**0 # ELF32-NEXT: filesz 0x00000001 memsz 0x00000001 flags --- -# 
ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT:EH_FRAME off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT:EH_FRAME off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: STACK off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: STACK off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: RELRO off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: RELRO off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT:PROPERTY off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT:PROPERTY off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: OPENBSD_RANDOMIZE off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: SFRAME off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: OPENBSD_WXNEEDED off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: OPENBSD_RANDOMIZE off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: OPENBSD_BOOTDATA off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: OPENBSD_WXNEEDED off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: OPENBSD_BOOTDATA off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- -# ELF32-NEXT: UNKNOWN off 0x00000314 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 +# ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 flags --- +# ELF32-NEXT: UNKNOWN off 0x00000334 vaddr 0x00001000 paddr 0x00001000 align 2**0 # ELF32-NEXT: filesz 0x00000003 memsz 0x00000003 
flags --- # ELF32-EMPTY: @@ -56,51 +58,53 @@ # RUN: llvm-objdump --private-headers %t64.elf | FileCheck %s --check-prefixes=ELF64 # ELF64: Program Header: -# ELF64-NEXT: PHDR off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: PHDR off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags -w- -# ELF64-NEXT: PHDR off 0x000000000000054b vaddr 0x0000000000002000 paddr 0x0000000000002000 align 2**0 +# ELF64-NEXT: PHDR off 0x0000000000000583 vaddr 0x0000000000002000 paddr 0x0000000000002000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000007 memsz 0x0000000000000007 flags --x -# ELF64-NEXT: UNKNOWN off 0x000000000000054b vaddr 0x0000000000002000 paddr 0x0000000000002000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000583 vaddr 0x0000000000002000 paddr 0x0000000000002000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000007 memsz 0x0000000000000007 flags --x -# ELF64-NEXT: DYNAMIC off 0x0000000000000558 vaddr 0x0000000000006000 paddr 0x0000000000006000 align 2**0 +# ELF64-NEXT: DYNAMIC off 0x0000000000000590 vaddr 0x0000000000006000 paddr 0x0000000000006000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000020 memsz 0x0000000000000020 flags rwx -# ELF64-NEXT: INTERP off 0x0000000000000552 vaddr 0x0000000000003000 paddr 0x0000000000003000 align 2**0 +# ELF64-NEXT: INTERP off 0x000000000000058a vaddr 0x0000000000003000 paddr 0x0000000000003000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000004 memsz 0x0000000000000004 flags rw- -# ELF64-NEXT: NOTE off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: NOTE off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000001 memsz 0x0000000000000001 flags --- -# ELF64-NEXT: TLS off 0x0000000000000556 vaddr 0x0000000000004000 paddr 0x0000000000004000 align 2**0 +# ELF64-NEXT: TLS off 0x000000000000058e vaddr 0x0000000000004000 paddr 0x0000000000004000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000001 memsz 0x0000000000000001 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- +# ELF64-NEXT:EH_FRAME off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT:EH_FRAME off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: STACK off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: STACK off 0x0000000000000548 vaddr 
0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: RELRO off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: RELRO off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: PROPERTY off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: PROPERTY off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: SFRAME off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: OPENBSD_RANDOMIZE off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: OPENBSD_RANDOMIZE off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: OPENBSD_WXNEEDED off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: OPENBSD_WXNEEDED off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: OPENBSD_BOOTDATA off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: OPENBSD_BOOTDATA off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- -# ELF64-NEXT: UNKNOWN off 0x0000000000000548 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 +# ELF64-NEXT: UNKNOWN off 0x0000000000000580 vaddr 0x0000000000001000 paddr 0x0000000000001000 align 2**0 # ELF64-NEXT: filesz 0x0000000000000003 memsz 0x0000000000000003 flags --- # ELF64-EMPTY: @@ -229,47 +233,52 @@ ProgramHeaders: VAddr: 0x1000 FirstSec: 
.foo.begin LastSec: .foo.end -## Case 15: the PT_OPENBSD_RANDOMIZE segment. +## Case 15: the PT_GNU_SFRAME segment. + - Type: 0x6474e554 ## PT_GNU_SFRAME + VAddr: 0x1000 + FirstSec: .foo.begin + LastSec: .foo.end +## Case 16: the PT_OPENBSD_RANDOMIZE segment. - Type: 0x65a3dbe6 ## PT_OPENBSD_RANDOMIZE VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 16: the PT_OPENBSD_WXNEEDED segment. +## Case 17: the PT_OPENBSD_WXNEEDED segment. - Type: 0x65a3dbe7 ## PT_OPENBSD_WXNEEDED VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 17: the PT_OPENBSD_BOOTDATA segment. +## Case 18: the PT_OPENBSD_BOOTDATA segment. - Type: 0x65a41be6 ## PT_OPENBSD_BOOTDATA VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 18: the PT_HIOS segment. +## Case 19: the PT_HIOS segment. - Type: 0x6fffffff ## PT_HIOS VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 19: the PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO segment. +## Case 20: the PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO segment. - Type: 0x70000000 ## PT_LOPROC/PT_ARM_ARCHEXT/PT_MIPS_REGINFO VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 20: the PT_ARM_EXIDX/PT_MIPS_RTPROC segment. +## Case 21: the PT_ARM_EXIDX/PT_MIPS_RTPROC segment. - Type: 0x70000001 ## PT_ARM_EXIDX, PT_MIPS_RTPROC VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 20: the PT_MIPS_OPTIONS segment. +## Case 22: the PT_MIPS_OPTIONS segment. - Type: 0x70000002 ## PT_MIPS_OPTIONS VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 21: the PT_MIPS_ABIFLAGS segment. +## Case 23: the PT_MIPS_ABIFLAGS segment. - Type: 0x70000003 ## PT_MIPS_ABIFLAGS VAddr: 0x1000 FirstSec: .foo.begin LastSec: .foo.end -## Case 22: the PT_HIPROC segment. +## Case 24: the PT_HIPROC segment. - Type: 0x7fffffff ## PT_HIPROC VAddr: 0x1000 FirstSec: .foo.begin diff --git a/llvm/test/tools/llvm-original-di-preservation/acceptance-test.test b/llvm/test/tools/llvm-original-di-preservation/acceptance-test.test new file mode 100644 index 0000000000000..0b8c33d24396a --- /dev/null +++ b/llvm/test/tools/llvm-original-di-preservation/acceptance-test.test @@ -0,0 +1,70 @@ +RUN: not %llvm-original-di-preservation %p/Inputs/sample.json --acceptance-test | FileCheck %s +CHECK: DILocation Bugs: +CHECK-NEXT: test.ll: +CHECK-NEXT: no-name: +CHECK-NEXT: - action: not-generate +CHECK-NEXT: bb_name: no-name +CHECK-NEXT: fn_name: fn +CHECK-NEXT: instr: extractvalue +CHECK-NEXT: - action: not-generate +CHECK-NEXT: bb_name: no-name +CHECK-NEXT: fn_name: fn +CHECK-NEXT: instr: insertvalue +CHECK-NEXT: - action: not-generate +CHECK-NEXT: bb_name: no-name +CHECK-NEXT: fn_name: fn1 +CHECK-NEXT: instr: insertvalue +CHECK-NEXT: - action: not-generate +CHECK-NEXT: bb_name: no-name +CHECK-NEXT: fn_name: fn1 +CHECK-NEXT: instr: extractvalue +CHECK: Errors detected for: + +RUN: not %llvm-original-di-preservation %p/Inputs/sample.json --acceptance-test --reduce | FileCheck %s --check-prefix=COMPRESS +COMPRESS: DILocation Bugs: +COMPRESS-NEXT: test.ll: +COMPRESS-NEXT: no-name: +COMPRESS-NEXT: - action: not-generate +COMPRESS-NEXT: bb_name: no-name +COMPRESS-NEXT: fn_name: fn +COMPRESS-NEXT: instr: extractvalue +COMPRESS-NEXT: - action: not-generate +COMPRESS-NEXT: bb_name: no-name +COMPRESS-NEXT: fn_name: fn +COMPRESS-NEXT: instr: insertvalue +COMPRESS: Errors detected for: + +RUN: not %llvm-original-di-preservation %p/Inputs/origin.json --acceptance-test --reduce | FileCheck %s --check-prefix=ORIGIN +ORIGIN: DILocation Bugs: +ORIGIN-NEXT: test.ll: +ORIGIN-NEXT: 
LoopVectorizePass: +ORIGIN-NEXT: - action: not-generate +ORIGIN-NEXT: bb_name: no-name +ORIGIN-NEXT: fn_name: fn +ORIGIN-NEXT: instr: add +ORIGIN-NEXT: origin: | +ORIGIN-NEXT: Stack Trace 0: +ORIGIN-NEXT: #0 0x00005895d035c935 llvm::DbgLocOrigin::DbgLocOrigin(bool) /tmp/llvm-project/llvm/lib/IR/DebugLoc.cpp:22:9 +ORIGIN-NEXT: #1 0x00005895d03af013 llvm::DILocAndCoverageTracking::DILocAndCoverageTracking() /tmp/llvm-project/llvm/include/llvm/IR/DebugLoc.h:90:11 +ORIGIN-NEXT: #2 0x00005895d03af013 llvm::DebugLoc::DebugLoc() /tmp/llvm-project/llvm/include/llvm/IR/DebugLoc.h:133:5 +ORIGIN-NEXT: #3 0x00005895d03af013 llvm::Instruction::Instruction(llvm::Type*, unsigned int, llvm::User::AllocInfo, llvm::InsertPosition) /tmp/llvm-project/llvm/lib/IR/Instruction.cpp:37:14 +ORIGIN-NEXT: #4 0x00005895d06862b5 llvm::PHINode::PHINode(llvm::Type*, unsigned int, llvm::Twine const&, llvm::InsertPosition) /tmp/llvm-project/llvm/include/llvm/IR/Instructions.h:0:9 +ORIGIN-NEXT: #5 0x00005895d06862b5 llvm::PHINode::Create(llvm::Type*, unsigned int, llvm::Twine const&, llvm::InsertPosition) /tmp/llvm-project/llvm/include/llvm/IR/Instructions.h:2651:9 +ORIGIN-NEXT: #6 0x00005895d06862b5 llvm::InstCombinerImpl::foldPHIArgGEPIntoPHI(llvm::PHINode&) /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp:617:9 +ORIGIN-NEXT: #7 0x00005895d0688fe0 llvm::InstCombinerImpl::visitPHINode(llvm::PHINode&) /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp:1456:22 +ORIGIN-NEXT: #8 0x00005895d05cd21f llvm::InstCombinerImpl::run() /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp:5327:22 +ORIGIN-NEXT: #9 0x00005895d05d067e combineInstructionsOverFunction(llvm::Function&, llvm::InstructionWorklist&, llvm::AAResults*, llvm::AssumptionCache&, llvm::TargetLibraryInfo&, llvm::TargetTransformInfo&, llvm::DominatorTree&, llvm::OptimizationRemarkEmitter&, llvm::BlockFrequencyInfo*, llvm::BranchProbabilityInfo*, llvm::ProfileSummaryInfo*, llvm::InstCombineOptions const&) /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp:5643:31 +ORIGIN-NEXT: #10 0x00005895d05cf9a9 llvm::InstCombinePass::run(llvm::Function&, llvm::AnalysisManager&) /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp:5706:8 +ORIGIN-NEXT: #11 0x00005895d107d07d llvm::detail::PassModel>::run(llvm::Function&, llvm::AnalysisManager&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN-NEXT: #12 0x00005895d04204a7 llvm::PassManager>::run(llvm::Function&, llvm::AnalysisManager&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerImpl.h:85:8 +ORIGIN-NEXT: #13 0x00005895ce4cb09d llvm::detail::PassModel>, llvm::AnalysisManager>::run(llvm::Function&, llvm::AnalysisManager&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN-NEXT: #14 0x00005895cfae2865 llvm::CGSCCToFunctionPassAdaptor::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/lib/Analysis/CGSCCPassManager.cpp:0:38 +ORIGIN-NEXT: #15 0x00005895ce4cad5d llvm::detail::PassModel, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN-NEXT: #16 0x00005895cfade813 llvm::PassManager, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) 
/tmp/llvm-project/llvm/lib/Analysis/CGSCCPassManager.cpp:93:12 +ORIGIN-NEXT: #17 0x00005895d1e3968d llvm::detail::PassModel, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>, llvm::AnalysisManager, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN-NEXT: #18 0x00005895cfae1224 llvm::DevirtSCCRepeatedPass::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/lib/Analysis/CGSCCPassManager.cpp:0:38 +ORIGIN-NEXT: #19 0x00005895d1e5067d llvm::detail::PassModel, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN: Errors detected for: + +RUN: %llvm-original-di-preservation %p/Inputs/non-existent.json --acceptance-test | FileCheck %s --check-prefix=EMPTY +EMPTY: No errors detected for: diff --git a/llvm/test/tools/llvm-original-di-preservation/basic.test b/llvm/test/tools/llvm-original-di-preservation/basic.test index 5ef670b42c667..df43fbb3b5b9f 100644 --- a/llvm/test/tools/llvm-original-di-preservation/basic.test +++ b/llvm/test/tools/llvm-original-di-preservation/basic.test @@ -1,17 +1,17 @@ -RUN: %llvm-original-di-preservation %p/Inputs/sample.json %t.html | FileCheck %s +RUN: %llvm-original-di-preservation %p/Inputs/sample.json --report-html-file %t.html | FileCheck %s RUN: diff -w %p/Inputs/expected-sample.html %t.html CHECK: The {{.+}}.html generated. CHECK-NOT: Skipped lines: -RUN: %llvm-original-di-preservation %p/Inputs/corrupted.json %t2.html | FileCheck %s -check-prefix=CORRUPTED +RUN: %llvm-original-di-preservation %p/Inputs/corrupted.json --report-html-file %t2.html | FileCheck %s -check-prefix=CORRUPTED RUN: diff -w %p/Inputs/expected-skipped.html %t2.html CORRUPTED: Skipped lines: 3 CORRUPTED: Skipped bugs: 1 -RUN: %llvm-original-di-preservation -compress %p/Inputs/sample.json %t3.html | FileCheck %s -check-prefix=COMPRESSED +RUN: %llvm-original-di-preservation --reduce %p/Inputs/sample.json --report-html-file %t3.html | FileCheck %s -check-prefix=REDUCE RUN: diff -w %p/Inputs/expected-compressed.html %t3.html -COMPRESSED: The {{.+}}.html generated. -COMPRESSED-NOT: Skipped lines: +REDUCE: The {{.+}}.html generated. 
+REDUCE-NOT: Skipped lines: -RUN: %llvm-original-di-preservation %p/Inputs/origin.json %t4.html | FileCheck %s +RUN: %llvm-original-di-preservation %p/Inputs/origin.json --report-html-file %t4.html | FileCheck %s RUN: diff -w %p/Inputs/expected-origin.html %t4.html diff --git a/llvm/test/tools/llvm-readobj/ELF/section-types.test b/llvm/test/tools/llvm-readobj/ELF/section-types.test index a428666c6ada9..e3a66832fa0d2 100644 --- a/llvm/test/tools/llvm-readobj/ELF/section-types.test +++ b/llvm/test/tools/llvm-readobj/ELF/section-types.test @@ -63,6 +63,8 @@ # LLVM: Type: SHT_LLVM_PART_PHDR # LLVM: Name: .llvm.lto # LLVM: Type: SHT_LLVM_LTO +# LLVM: Name: gnu_sframe +# LLVM: Type: SHT_GNU_SFRAME # LLVM: Name: gnu_attributes # LLVM: Type: SHT_GNU_ATTRIBUTES # LLVM: Name: gnu_hash @@ -125,6 +127,7 @@ # GNU-NEXT: part1 LLVM_PART_EHDR # GNU-NEXT: .phdrs LLVM_PART_PHDR # GNU-NEXT: .llvm.lto LLVM_LTO +# GNU-NEXT: gnu_sframe SFRAME # GNU-NEXT: gnu_attributes ATTRIBUTES # GNU-NEXT: gnu_hash GNU_HASH # GNU-NEXT: gnu_verdef VERDEF @@ -215,6 +218,8 @@ Sections: Type: SHT_LLVM_PART_PHDR - Name: .llvm.lto Type: SHT_LLVM_LTO + - Name: gnu_sframe + Type: SHT_GNU_SFRAME - Name: gnu_attributes Type: SHT_GNU_ATTRIBUTES - Name: gnu_hash diff --git a/llvm/test/tools/yaml2obj/ELF/program-header.yaml b/llvm/test/tools/yaml2obj/ELF/program-header.yaml index 4899102346af4..b2b7ead6e794b 100644 --- a/llvm/test/tools/yaml2obj/ELF/program-header.yaml +++ b/llvm/test/tools/yaml2obj/ELF/program-header.yaml @@ -39,6 +39,7 @@ ProgramHeaders: - Type: PT_GNU_STACK - Type: PT_GNU_RELRO - Type: PT_GNU_PROPERTY + - Type: PT_GNU_SFRAME #CHECK: ProgramHeaders [ #CHECK-NEXT: ProgramHeader { @@ -78,6 +79,9 @@ ProgramHeaders: #CHECK-NEXT: ProgramHeader { #CHECK-NEXT: Type: PT_GNU_PROPERTY (0x6474E553) #CHECK: } +#CHECK-NEXT: ProgramHeader { +#CHECK-NEXT: Type: PT_GNU_SFRAME (0x6474E554) +#CHECK: } #CHECK-NEXT:] ## Check we do not allow referencing sections that do not exist. diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt new file mode 100644 index 0000000000000..a4cf9690e86b5 --- /dev/null +++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS + Analysis + Core + IRReader + Support + ) + +add_llvm_tool(llvm-ir2vec + llvm-ir2vec.cpp + ) diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp new file mode 100644 index 0000000000000..e1e5fad13f413 --- /dev/null +++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp @@ -0,0 +1,314 @@ +//===- llvm-ir2vec.cpp - IR2Vec Embedding Generation Tool -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the IR2Vec embedding generation tool. +/// +/// This tool provides two main functionalities: +/// +/// 1. Triplet Generation Mode (--mode=triplets): +/// Generates triplets (opcode, type, operands) for vocabulary training. +/// Usage: llvm-ir2vec --mode=triplets input.bc -o triplets.txt +/// +/// 2. Embedding Generation Mode (--mode=embeddings): +/// Generates IR2Vec embeddings using a trained vocabulary. 
+/// Usage: llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json
+/// --level=func input.bc -o embeddings.txt Levels: --level=inst
+/// (instructions), --level=bb (basic blocks), --level=func (functions)
+/// (See IR2Vec.cpp for more embedding generation options)
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassInstrumentation.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "ir2vec"
+
+namespace llvm {
+namespace ir2vec {
+
+static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options");
+
+static cl::opt<std::string>
+    InputFilename(cl::Positional,
+                  cl::desc(""),
+                  cl::init("-"), cl::cat(IR2VecToolCategory));
+
+static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
+                                           cl::value_desc("filename"),
+                                           cl::init("-"),
+                                           cl::cat(IR2VecToolCategory));
+
+enum ToolMode {
+  TripletMode,  // Generate triplets for vocabulary training
+  EmbeddingMode // Generate embeddings using trained vocabulary
+};
+
+static cl::opt<ToolMode>
+    Mode("mode", cl::desc("Tool operation mode:"),
+         cl::values(clEnumValN(TripletMode, "triplets",
+                               "Generate triplets for vocabulary training"),
+                    clEnumValN(EmbeddingMode, "embeddings",
+                               "Generate embeddings using trained vocabulary")),
+         cl::init(EmbeddingMode), cl::cat(IR2VecToolCategory));
+
+static cl::opt<std::string>
+    FunctionName("function", cl::desc("Process specific function only"),
+                 cl::value_desc("name"), cl::Optional, cl::init(""),
+                 cl::cat(IR2VecToolCategory));
+
+enum EmbeddingLevel {
+  InstructionLevel, // Generate instruction-level embeddings
+  BasicBlockLevel,  // Generate basic block-level embeddings
+  FunctionLevel     // Generate function-level embeddings
+};
+
+static cl::opt<EmbeddingLevel>
+    Level("level", cl::desc("Embedding generation level (for embedding mode):"),
+          cl::values(clEnumValN(InstructionLevel, "inst",
+                                "Generate instruction-level embeddings"),
+                     clEnumValN(BasicBlockLevel, "bb",
+                                "Generate basic block-level embeddings"),
+                     clEnumValN(FunctionLevel, "func",
+                                "Generate function-level embeddings")),
+          cl::init(FunctionLevel), cl::cat(IR2VecToolCategory));
+
+namespace {
+
+/// Helper class for collecting IR triplets and generating embeddings
+class IR2VecTool {
+private:
+  Module &M;
+  ModuleAnalysisManager MAM;
+  const Vocabulary *Vocab = nullptr;
+
+public:
+  explicit IR2VecTool(Module &M) : M(M) {}
+
+  /// Initialize the IR2Vec vocabulary analysis
+  bool initializeVocabulary() {
+    // Register and run the IR2Vec vocabulary analysis
+    // The vocabulary file path is specified via --ir2vec-vocab-path global
+    // option
+    MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
+    MAM.registerPass([&] { return IR2VecVocabAnalysis(); });
+    Vocab = &MAM.getResult<IR2VecVocabAnalysis>(M);
+    return Vocab->isValid();
+  }
+
+  /// Generate triplets for the entire module
+  void generateTriplets(raw_ostream &OS) const {
+    for (const Function &F : M)
+      generateTriplets(F, OS);
+  }
+
+  /// Generate triplets for a single function
+  void generateTriplets(const Function &F, raw_ostream &OS) const {
+    if (F.isDeclaration())
+      return;
+
+    std::string
LocalOutput; + raw_string_ostream LocalOS(LocalOutput); + + for (const BasicBlock &BB : F) + traverseBasicBlock(BB, LocalOS); + + LocalOS.flush(); + OS << LocalOutput; + } + + /// Generate embeddings for the entire module + void generateEmbeddings(raw_ostream &OS) const { + if (!Vocab->isValid()) { + OS << "Error: Vocabulary is not valid. IR2VecTool not initialized.\n"; + return; + } + + for (const Function &F : M) + generateEmbeddings(F, OS); + } + + /// Generate embeddings for a single function + void generateEmbeddings(const Function &F, raw_ostream &OS) const { + if (F.isDeclaration()) { + OS << "Function " << F.getName() << " is a declaration, skipping.\n"; + return; + } + + // Create embedder for this function + assert(Vocab->isValid() && "Vocabulary is not valid"); + auto Emb = Embedder::create(IR2VecKind::Symbolic, F, *Vocab); + if (!Emb) { + OS << "Error: Failed to create embedder for function " << F.getName() + << "\n"; + return; + } + + OS << "Function: " << F.getName() << "\n"; + + // Generate embeddings based on the specified level + switch (Level) { + case FunctionLevel: { + Emb->getFunctionVector().print(OS); + break; + } + case BasicBlockLevel: { + const auto &BBVecMap = Emb->getBBVecMap(); + for (const BasicBlock &BB : F) { + auto It = BBVecMap.find(&BB); + if (It != BBVecMap.end()) { + OS << BB.getName() << ":"; + It->second.print(OS); + } + } + break; + } + case InstructionLevel: { + const auto &InstMap = Emb->getInstVecMap(); + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + auto It = InstMap.find(&I); + if (It != InstMap.end()) { + I.print(OS); + It->second.print(OS); + } + } + } + break; + } + } + } + +private: + /// Process a single basic block for triplet generation + void traverseBasicBlock(const BasicBlock &BB, raw_string_ostream &OS) const { + // Consider only non-debug and non-pseudo instructions + for (const auto &I : BB.instructionsWithoutDebug()) { + StringRef OpcStr = Vocabulary::getVocabKeyForOpcode(I.getOpcode()); + StringRef TypeStr = + Vocabulary::getVocabKeyForTypeID(I.getType()->getTypeID()); + + OS << '\n' << OpcStr << ' ' << TypeStr << ' '; + + LLVM_DEBUG({ + I.print(dbgs()); + dbgs() << "\n"; + I.getType()->print(dbgs()); + dbgs() << " Type\n"; + }); + + for (const Use &U : I.operands()) + OS << Vocabulary::getVocabKeyForOperandKind( + Vocabulary::getOperandKind(U.get())) + << ' '; + } + } +}; + +Error processModule(Module &M, raw_ostream &OS) { + IR2VecTool Tool(M); + + if (Mode == EmbeddingMode) { + // Initialize vocabulary for embedding generation + // Note: Requires --ir2vec-vocab-path option to be set + if (!Tool.initializeVocabulary()) + return createStringError( + errc::invalid_argument, + "Failed to initialize IR2Vec vocabulary. 
" + "Make sure to specify --ir2vec-vocab-path for embedding mode."); + + if (!FunctionName.empty()) { + // Process single function + if (const Function *F = M.getFunction(FunctionName)) + Tool.generateEmbeddings(*F, OS); + else + return createStringError(errc::invalid_argument, + "Function '%s' not found", + FunctionName.c_str()); + } else { + // Process all functions + Tool.generateEmbeddings(OS); + } + } else { + // Triplet generation mode - no vocabulary needed + if (!FunctionName.empty()) + // Process single function + if (const Function *F = M.getFunction(FunctionName)) + Tool.generateTriplets(*F, OS); + else + return createStringError(errc::invalid_argument, + "Function '%s' not found", + FunctionName.c_str()); + else + // Process all functions + Tool.generateTriplets(OS); + } + return Error::success(); +} +} // namespace +} // namespace ir2vec +} // namespace llvm + +int main(int argc, char **argv) { + using namespace llvm; + using namespace llvm::ir2vec; + + InitLLVM X(argc, argv); + cl::HideUnrelatedOptions(IR2VecToolCategory); + cl::ParseCommandLineOptions( + argc, argv, + "IR2Vec - Embedding Generation Tool\n" + "Generates embeddings for a given LLVM IR and " + "supports triplet generation for vocabulary " + "training and embedding generation.\n\n" + "See https://llvm.org/docs/CommandGuide/llvm-ir2vec.html for more " + "information.\n"); + + // Validate command line options + if (Mode == TripletMode && Level.getNumOccurrences() > 0) + errs() << "Warning: --level option is ignored in triplet mode\n"; + + // Parse the input LLVM IR file or stdin + SMDiagnostic Err; + LLVMContext Context; + std::unique_ptr M = parseIRFile(InputFilename, Err, Context); + if (!M) { + Err.print(argv[0], errs()); + return 1; + } + + std::error_code EC; + raw_fd_ostream OS(OutputFilename, EC); + if (EC) { + errs() << "Error opening output file: " << EC.message() << "\n"; + return 1; + } + + if (Error Err = processModule(*M, OS)) { + handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EIB) { + errs() << "Error: " << EIB.message() << "\n"; + }); + return 1; + } + + return 0; +} diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index 0d209590655ef..175f77c894825 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -308,6 +308,8 @@ static const StringMap TargetMap{ // RISC-V {"elf32-littleriscv", {ELF::EM_RISCV, false, true}}, {"elf64-littleriscv", {ELF::EM_RISCV, true, true}}, + {"elf32-bigriscv", {ELF::EM_RISCV, false, false}}, + {"elf64-bigriscv", {ELF::EM_RISCV, true, false}}, // PowerPC {"elf32-powerpc", {ELF::EM_PPC, false, false}}, {"elf32-powerpcle", {ELF::EM_PPC, false, true}}, diff --git a/llvm/tools/llvm-objdump/ELFDump.cpp b/llvm/tools/llvm-objdump/ELFDump.cpp index 2503008929f98..1e7bb00916cd7 100644 --- a/llvm/tools/llvm-objdump/ELFDump.cpp +++ b/llvm/tools/llvm-objdump/ELFDump.cpp @@ -291,6 +291,9 @@ template void ELFDumper::printProgramHeaders() { case ELF::PT_GNU_STACK: outs() << " STACK "; break; + case ELF::PT_GNU_SFRAME: + outs() << " SFRAME "; + break; case ELF::PT_INTERP: outs() << " INTERP "; break; diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 101079f09e1d2..465c189680cae 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1490,6 +1490,7 @@ static StringRef segmentTypeToString(unsigned Arch, unsigned Type) { LLVM_READOBJ_ENUM_CASE(ELF, PT_GNU_STACK); LLVM_READOBJ_ENUM_CASE(ELF, 
PT_GNU_RELRO);
    LLVM_READOBJ_ENUM_CASE(ELF, PT_GNU_PROPERTY);
+   LLVM_READOBJ_ENUM_CASE(ELF, PT_GNU_SFRAME);
    LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_MUTABLE);
    LLVM_READOBJ_ENUM_CASE(ELF, PT_OPENBSD_RANDOMIZE);
diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt
index 9a2015f61f2bf..53003d90160fe 100644
--- a/llvm/tools/llvm-shlib/CMakeLists.txt
+++ b/llvm/tools/llvm-shlib/CMakeLists.txt
@@ -41,6 +41,15 @@ if(LLVM_BUILD_LLVM_DYLIB)
     llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} $ SHARED FULL_DEST COMPONENT LLVM)
   endif()
+  if (MINGW OR CYGWIN)
+    # The LLVM DLL is supposed to export all symbols (except for ones
+    # that are explicitly hidden). Normally, this is what happens anyway, but
+    # if there are symbols that are marked explicitly as dllexport, we'd only
+    # export them and nothing else. Therefore, add --export-all-symbols to
+    # make sure we export all symbols despite potential dllexports.
+    target_link_options(LLVM PRIVATE LINKER:--export-all-symbols)
+  endif()
+
   list(REMOVE_DUPLICATES LIB_NAMES)
   if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
     set(LIB_NAMES -Wl,-all_load ${LIB_NAMES})
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index cb6d633306a81..7c9a5464bfe1d 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -396,6 +396,69 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
   }
 }
 
+TEST(IR2VecVocabularyTest, NumericIDMap) {
+  // Test getNumericID for opcodes
+  EXPECT_EQ(Vocabulary::getNumericID(1u), 0u);
+  EXPECT_EQ(Vocabulary::getNumericID(13u), 12u);
+  EXPECT_EQ(Vocabulary::getNumericID(MaxOpcodes), MaxOpcodes - 1);
+
+  // Test getNumericID for Type IDs
+  EXPECT_EQ(Vocabulary::getNumericID(Type::VoidTyID),
+            MaxOpcodes + static_cast<unsigned>(Type::VoidTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::HalfTyID),
+            MaxOpcodes + static_cast<unsigned>(Type::HalfTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::FloatTyID),
+            MaxOpcodes + static_cast<unsigned>(Type::FloatTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::IntegerTyID),
+            MaxOpcodes + static_cast<unsigned>(Type::IntegerTyID));
+  EXPECT_EQ(Vocabulary::getNumericID(Type::PointerTyID),
+            MaxOpcodes + static_cast<unsigned>(Type::PointerTyID));
+
+  // Test getNumericID for Value operands
+  LLVMContext Ctx;
+  Module M("TestM", Ctx);
+  FunctionType *FTy =
+      FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "testFunc", M);
+
+  // Test Function operand
+  EXPECT_EQ(Vocabulary::getNumericID(F),
+            MaxOpcodes + MaxTypeIDs + 0u); // Function = 0
+
+  // Test Constant operand
+  Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
+  EXPECT_EQ(Vocabulary::getNumericID(C),
+            MaxOpcodes + MaxTypeIDs + 2u); // Constant = 2
+
+  // Test Pointer operand
+  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+  AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB);
+  EXPECT_EQ(Vocabulary::getNumericID(PtrVal),
+            MaxOpcodes + MaxTypeIDs + 1u); // Pointer = 1
+
+  // Test Variable operand (function argument)
+  Argument *Arg = F->getArg(0);
+  EXPECT_EQ(Vocabulary::getNumericID(Arg),
+            MaxOpcodes + MaxTypeIDs + 3u); // Variable = 3
+}
+
+#if GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+TEST(IR2VecVocabularyTest, NumericIDMapInvalidInputs) {
+  // Test invalid opcode IDs
+  EXPECT_DEATH(Vocabulary::getNumericID(0u), "Invalid opcode");
+  EXPECT_DEATH(Vocabulary::getNumericID(MaxOpcodes + 1), "Invalid opcode");
+
+  // Test invalid type IDs
+  
EXPECT_DEATH(Vocabulary::getNumericID(static_cast<Type::TypeID>(MaxTypeIDs)),
+               "Invalid type ID");
+  EXPECT_DEATH(
+      Vocabulary::getNumericID(static_cast<Type::TypeID>(MaxTypeIDs + 10)),
+      "Invalid type ID");
+}
+#endif // NDEBUG
+#endif // GTEST_HAS_DEATH_TEST
+
 TEST(IR2VecVocabularyTest, StringKeyGeneration) {
   EXPECT_EQ(Vocabulary::getStringKey(0), "Ret");
   EXPECT_EQ(Vocabulary::getStringKey(12), "Add");
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 7a48105a1dc99..4b476551f63d9 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -912,8 +912,8 @@ TEST(ValueTracking, propagatesPoison) {
       {false, "call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt)", 2},
       {true, "call float @llvm.sqrt.f32(float %fx)", 0},
       {true, "call float @llvm.powi.f32.i32(float %fx, i32 %x)", 0},
-      {false, "call float @llvm.sin.f32(float %fx)", 0},
-      {false, "call float @llvm.cos.f32(float %fx)", 0},
+      {true, "call float @llvm.sin.f32(float %fx)", 0},
+      {true, "call float @llvm.cos.f32(float %fx)", 0},
       {true, "call float @llvm.pow.f32(float %fx, float %fy)", 0},
      {false, "call float @llvm.exp.f32(float %fx)", 0},
      {false, "call float @llvm.exp2.f32(float %fx)", 0},
diff --git a/llvm/unittests/BinaryFormat/CMakeLists.txt b/llvm/unittests/BinaryFormat/CMakeLists.txt
index 40d3bc4dca0b6..eac5977a2c1c3 100644
--- a/llvm/unittests/BinaryFormat/CMakeLists.txt
+++ b/llvm/unittests/BinaryFormat/CMakeLists.txt
@@ -10,6 +10,7 @@ add_llvm_unittest(BinaryFormatTests
   MsgPackDocumentTest.cpp
   MsgPackReaderTest.cpp
   MsgPackWriterTest.cpp
+  SFrameTest.cpp
   TestFileMagic.cpp
   )
 
diff --git a/llvm/unittests/BinaryFormat/SFrameTest.cpp b/llvm/unittests/BinaryFormat/SFrameTest.cpp
new file mode 100644
index 0000000000000..394e382e041e9
--- /dev/null
+++ b/llvm/unittests/BinaryFormat/SFrameTest.cpp
@@ -0,0 +1,118 @@
+//===- SFrameTest.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/SFrame.h"
+#include "gtest/gtest.h"
+#include <type_traits>
+
+using namespace llvm;
+using namespace llvm::sframe;
+
+namespace {
+
+template <typename EndianT> class SFrameTest : public testing::Test {
+protected:
+  static constexpr endianness Endian = EndianT::value;
+
+  // Test structure sizes and triviality.
+  static_assert(std::is_trivial_v<Preamble<Endian>>);
+  static_assert(sizeof(Preamble<Endian>) == 4);
+
+  static_assert(std::is_trivial_v<Header<Endian>>);
+  static_assert(sizeof(Header<Endian>) == 28);
+
+  static_assert(std::is_trivial_v<FuncDescEntry<Endian>>);
+  static_assert(sizeof(FuncDescEntry<Endian>) == 20);
+
+  static_assert(std::is_trivial_v<FrameRowEntryAddr1<Endian>>);
+  static_assert(sizeof(FrameRowEntryAddr1<Endian>) == 2);
+
+  static_assert(std::is_trivial_v<FrameRowEntryAddr2<Endian>>);
+  static_assert(sizeof(FrameRowEntryAddr2<Endian>) == 3);
+
+  static_assert(std::is_trivial_v<FrameRowEntryAddr4<Endian>>);
+  static_assert(sizeof(FrameRowEntryAddr4<Endian>) == 5);
+};
+
+struct NameGenerator {
+  template <typename T> static constexpr const char *GetName(int) {
+    if constexpr (T::value == endianness::little)
+      return "little";
+    if constexpr (T::value == endianness::big)
+      return "big";
+  }
+};
+using Types =
+    testing::Types<std::integral_constant<endianness, endianness::little>,
+                   std::integral_constant<endianness, endianness::big>>;
+TYPED_TEST_SUITE(SFrameTest, Types, NameGenerator);
+
+TYPED_TEST(SFrameTest, FDEFlags) {
+  FuncDescEntry<TestFixture::Endian> FDE = {};
+  EXPECT_EQ(FDE.Info, 0u);
+  EXPECT_EQ(FDE.getPAuthKey(), 0);
+  EXPECT_EQ(FDE.getFDEType(), FDEType::PCInc);
+  EXPECT_EQ(FDE.getFREType(), FREType::Addr1);
+
+  FDE.setPAuthKey(1);
+  EXPECT_EQ(FDE.Info, 0x20u);
+  EXPECT_EQ(FDE.getPAuthKey(), 1);
+  EXPECT_EQ(FDE.getFDEType(), FDEType::PCInc);
+  EXPECT_EQ(FDE.getFREType(), FREType::Addr1);
+
+  FDE.setFDEType(FDEType::PCMask);
+  EXPECT_EQ(FDE.Info, 0x30u);
+  EXPECT_EQ(FDE.getPAuthKey(), 1);
+  EXPECT_EQ(FDE.getFDEType(), FDEType::PCMask);
+  EXPECT_EQ(FDE.getFREType(), FREType::Addr1);
+
+  FDE.setFREType(FREType::Addr4);
+  EXPECT_EQ(FDE.Info, 0x32u);
+  EXPECT_EQ(FDE.getPAuthKey(), 1);
+  EXPECT_EQ(FDE.getFDEType(), FDEType::PCMask);
+  EXPECT_EQ(FDE.getFREType(), FREType::Addr4);
+}
+
+TYPED_TEST(SFrameTest, FREFlags) {
+  FREInfo<TestFixture::Endian> Info = {};
+  EXPECT_EQ(Info.Info, 0u);
+  EXPECT_FALSE(Info.isReturnAddressSigned());
+  EXPECT_EQ(Info.getOffsetSize(), FREOffset::B1);
+  EXPECT_EQ(Info.getOffsetCount(), 0u);
+  EXPECT_EQ(Info.getBaseRegister(), BaseReg::FP);
+
+  Info.setReturnAddressSigned(true);
+  EXPECT_EQ(Info.Info, 0x80u);
+  EXPECT_TRUE(Info.isReturnAddressSigned());
+  EXPECT_EQ(Info.getOffsetSize(), FREOffset::B1);
+  EXPECT_EQ(Info.getOffsetCount(), 0u);
+  EXPECT_EQ(Info.getBaseRegister(), BaseReg::FP);
+
+  Info.setOffsetSize(FREOffset::B4);
+  EXPECT_EQ(Info.Info, 0xc0u);
+  EXPECT_TRUE(Info.isReturnAddressSigned());
+  EXPECT_EQ(Info.getOffsetSize(), FREOffset::B4);
+  EXPECT_EQ(Info.getOffsetCount(), 0u);
+  EXPECT_EQ(Info.getBaseRegister(), BaseReg::FP);
+
+  Info.setOffsetCount(3);
+  EXPECT_EQ(Info.Info, 0xc6u);
+  EXPECT_TRUE(Info.isReturnAddressSigned());
+  EXPECT_EQ(Info.getOffsetSize(), FREOffset::B4);
+  EXPECT_EQ(Info.getOffsetCount(), 3u);
+  EXPECT_EQ(Info.getBaseRegister(), BaseReg::FP);
+
+  Info.setBaseRegister(BaseReg::SP);
+  EXPECT_EQ(Info.Info, 0xc7u);
+  EXPECT_TRUE(Info.isReturnAddressSigned());
+  EXPECT_EQ(Info.getOffsetSize(), FREOffset::B4);
+  EXPECT_EQ(Info.getOffsetCount(), 3u);
+  EXPECT_EQ(Info.getBaseRegister(), BaseReg::SP);
+}
+
+} // namespace
diff --git a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp
index b1890d884d173..dfabb4ab76180 100644
--- a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp
+++ b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp
@@ -8,6 +8,7 @@
 #include
 #include
+#include
 
 #include "gtest/gtest.h"
 
@@ -96,21 +97,21 @@ namespace llvm {
 namespace jitlink {
 namespace aarch32 {
 
-HalfWords encodeImmBT4BlT1BlxT2(int64_t Value);
-HalfWords encodeImmBT4BlT1BlxT2_J1J2(int64_t Value);
-uint32_t encodeImmBA1BlA1BlxA2(int64_t Value);
-HalfWords
encodeImmMovtT1MovwT3(uint16_t Value);
-HalfWords encodeRegMovtT1MovwT3(int64_t Value);
-uint32_t encodeImmMovtA1MovwA2(uint16_t Value);
-uint32_t encodeRegMovtA1MovwA2(int64_t Value);
-
-int64_t decodeImmBT4BlT1BlxT2(uint32_t Hi, uint32_t Lo);
-int64_t decodeImmBT4BlT1BlxT2_J1J2(uint32_t Hi, uint32_t Lo);
-int64_t decodeImmBA1BlA1BlxA2(int64_t Value);
-uint16_t decodeImmMovtT1MovwT3(uint32_t Hi, uint32_t Lo);
-int64_t decodeRegMovtT1MovwT3(uint32_t Hi, uint32_t Lo);
-uint16_t decodeImmMovtA1MovwA2(uint64_t Value);
-int64_t decodeRegMovtA1MovwA2(uint64_t Value);
+LLVM_ABI HalfWords encodeImmBT4BlT1BlxT2(int64_t Value);
+LLVM_ABI HalfWords encodeImmBT4BlT1BlxT2_J1J2(int64_t Value);
+LLVM_ABI uint32_t encodeImmBA1BlA1BlxA2(int64_t Value);
+LLVM_ABI HalfWords encodeImmMovtT1MovwT3(uint16_t Value);
+LLVM_ABI HalfWords encodeRegMovtT1MovwT3(int64_t Value);
+LLVM_ABI uint32_t encodeImmMovtA1MovwA2(uint16_t Value);
+LLVM_ABI uint32_t encodeRegMovtA1MovwA2(int64_t Value);
+
+LLVM_ABI int64_t decodeImmBT4BlT1BlxT2(uint32_t Hi, uint32_t Lo);
+LLVM_ABI int64_t decodeImmBT4BlT1BlxT2_J1J2(uint32_t Hi, uint32_t Lo);
+LLVM_ABI int64_t decodeImmBA1BlA1BlxA2(int64_t Value);
+LLVM_ABI uint16_t decodeImmMovtT1MovwT3(uint32_t Hi, uint32_t Lo);
+LLVM_ABI int64_t decodeRegMovtT1MovwT3(uint32_t Hi, uint32_t Lo);
+LLVM_ABI uint16_t decodeImmMovtA1MovwA2(uint64_t Value);
+LLVM_ABI int64_t decodeRegMovtA1MovwA2(uint64_t Value);
 
 } // namespace aarch32
 } // namespace jitlink
diff --git a/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp b/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp
index 0363a08cc0f03..10329820bef76 100644
--- a/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp
@@ -48,12 +48,6 @@ static std::string &prepareParamName(std::string &Name) {
   return Name;
 }
 
-namespace llvm {
-template <> struct enum_iteration_traits<omp::Directive> {
-  static constexpr bool is_iterable = true;
-};
-} // namespace llvm
-
 // Test tokenizing.
 class Tokenize : public testing::TestWithParam<omp::Directive> {};
 
@@ -87,12 +81,10 @@ getParamName1(const testing::TestParamInfo<omp::Directive> &Info) {
   return prepareParamName(Name);
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    DirectiveNameParserTest, Tokenize,
-    testing::ValuesIn(
-        llvm::enum_seq(static_cast<omp::Directive>(0),
-                       static_cast<omp::Directive>(omp::Directive_enumSize))),
-    getParamName1);
+INSTANTIATE_TEST_SUITE_P(DirectiveNameParserTest, Tokenize,
+                         testing::ValuesIn(llvm::enum_seq_inclusive(
+                             omp::Directive::First_, omp::Directive::Last_)),
+                         getParamName1);
 
 // Test parsing of valid names.
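A note on the INSTANTIATE_TEST_SUITE_P hunks on either side of this point: they move from llvm::enum_seq over casted integers to llvm::enum_seq_inclusive over the First_/Last_ sentinels that the DirectiveEmitter change later in this patch starts emitting, along with the enum_iteration_traits opt-in that used to be spelled out locally. A minimal self-contained sketch of that pattern follows; the Color enum and countColors() are illustrative, not part of the patch:

#include "llvm/ADT/Sequence.h"

enum class Color { Red, Green, Blue, First_ = Red, Last_ = Blue };

namespace llvm {
// Without this opt-in, enum_seq_inclusive(Color::First_, Color::Last_)
// fails to compile; the TableGen backend now emits the same specialization
// for the generated Directive/Clause enums.
template <> struct enum_iteration_traits<Color> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm

static int countColors() {
  int N = 0;
  // Visits Red, Green, Blue; unlike enum_seq, the Last_ bound is included.
  for (Color C : llvm::enum_seq_inclusive(Color::First_, Color::Last_)) {
    (void)C;
    ++N;
  }
  return N; // 3
}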
@@ -131,9 +123,8 @@ getParamName2(const testing::TestParamInfo &Info) {
 
 INSTANTIATE_TEST_SUITE_P(
     DirectiveNameParserTest, ParseValid,
-    testing::Combine(testing::ValuesIn(llvm::enum_seq(
-                         static_cast<omp::Directive>(0),
-                         static_cast<omp::Directive>(omp::Directive_enumSize))),
+    testing::Combine(testing::ValuesIn(llvm::enum_seq_inclusive(
+                         omp::Directive::First_, omp::Directive::Last_)),
                      testing::ValuesIn(omp::getOpenMPVersions())),
     getParamName2);
 
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index be98be260c9dc..d6b578aa8ffd1 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -2242,23 +2242,34 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdIf) {
   PB.registerFunctionAnalyses(FAM);
   LoopInfo &LI = FAM.getResult<LoopAnalysis>(*F);
 
-  // Check if there are two loops (one with enabled vectorization)
+  // Check if there is one loop containing branches with and without
+  // vectorization
   const std::vector<Loop *> &TopLvl = LI.getTopLevelLoops();
-  EXPECT_EQ(TopLvl.size(), 2u);
+  EXPECT_EQ(TopLvl.size(), 1u);
 
   Loop *L = TopLvl[0];
   EXPECT_TRUE(findStringMetadataForLoop(L, "llvm.loop.parallel_accesses"));
-  EXPECT_TRUE(getBooleanLoopAttribute(L, "llvm.loop.vectorize.enable"));
-  EXPECT_EQ(getIntLoopAttribute(L, "llvm.loop.vectorize.width"), 3);
-
-  // The second loop should have disabled vectorization
-  L = TopLvl[1];
-  EXPECT_FALSE(findStringMetadataForLoop(L, "llvm.loop.parallel_accesses"));
+  // These attributes cannot be set because the loop is shared between simd
+  // and non-simd versions
   EXPECT_FALSE(getBooleanLoopAttribute(L, "llvm.loop.vectorize.enable"));
 
-  // Check for llvm.access.group metadata attached to the printf
-  // function in the loop body.
+  EXPECT_EQ(getIntLoopAttribute(L, "llvm.loop.vectorize.width"), 0);
+
+  // Check for if condition
   BasicBlock *LoopBody = CLI->getBody();
-  EXPECT_TRUE(any_of(*LoopBody, [](Instruction &I) {
+  BranchInst *IfCond = cast<BranchInst>(LoopBody->getTerminator());
+  EXPECT_EQ(IfCond->getCondition(), IfCmp);
+  BasicBlock *TrueBranch = IfCond->getSuccessor(0);
+  BasicBlock *FalseBranch = IfCond->getSuccessor(1)->getUniqueSuccessor();
+
+  // Check for llvm.access.group metadata attached to the printf
+  // function in the true body.
+  EXPECT_TRUE(any_of(*TrueBranch, [](Instruction &I) {
+    return I.getMetadata("llvm.access.group") != nullptr;
+  }));
+
+  // Check for llvm.access.group metadata attached to the printf
+  // function in the false body.
+  EXPECT_FALSE(any_of(*FalseBranch, [](Instruction &I) {
     return I.getMetadata("llvm.access.group") != nullptr;
   }));
 }
@@ -2725,8 +2736,8 @@ TEST_P(OpenMPIRBuilderTestWithParams, DynamicWorkShareLoop) {
   EXPECT_EQ(OrigUpperBound->getValue(), 21);
   EXPECT_EQ(OrigStride->getValue(), 1);
 
-  CallInst *FiniCall = dyn_cast<CallInst>(
-      &*(LatchBlock->getTerminator()->getPrevNonDebugInstruction(true)));
+  CallInst *FiniCall =
+      dyn_cast<CallInst>(&*(LatchBlock->getTerminator()->getPrevNode()));
   EXPECT_EQ(FiniCall, nullptr);
 
   // The original loop iterator should only be used in the condition, in the
@@ -2829,8 +2840,8 @@ TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoopOrdered) {
   EXPECT_EQ(SchedVal->getValue(),
             static_cast<uint64_t>(OMPScheduleType::OrderedStaticChunked));
 
-  CallInst *FiniCall = dyn_cast<CallInst>(
-      &*(LatchBlock->getTerminator()->getPrevNonDebugInstruction(true)));
+  CallInst *FiniCall =
+      dyn_cast<CallInst>(&*(LatchBlock->getTerminator()->getPrevNode()));
   ASSERT_NE(FiniCall, nullptr);
   EXPECT_EQ(FiniCall->getCalledFunction()->getName(),
             "__kmpc_dispatch_fini_4u");
@@ -4644,7 +4655,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithThreadLimit) {
 
   // Verifying that the next instruction to execute is kmpc_fork_teams
   BranchInst *BrInst =
-      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNonDebugInstruction());
+      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNode());
   ASSERT_NE(BrInst, nullptr);
   ASSERT_EQ(BrInst->getNumSuccessors(), 1U);
   BasicBlock::iterator NextInstruction =
@@ -4701,7 +4712,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsUpper) {
 
   // Verifying that the next instruction to execute is kmpc_fork_teams
   BranchInst *BrInst =
-      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNonDebugInstruction());
+      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNode());
   ASSERT_NE(BrInst, nullptr);
   ASSERT_EQ(BrInst->getNumSuccessors(), 1U);
   BasicBlock::iterator NextInstruction =
@@ -4761,7 +4772,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsBoth) {
 
   // Verifying that the next instruction to execute is kmpc_fork_teams
   BranchInst *BrInst =
-      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNonDebugInstruction());
+      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNode());
   ASSERT_NE(BrInst, nullptr);
   ASSERT_EQ(BrInst->getNumSuccessors(), 1U);
   BasicBlock::iterator NextInstruction =
@@ -4827,7 +4838,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsAndThreadLimit) {
 
   // Verifying that the next instruction to execute is kmpc_fork_teams
   BranchInst *BrInst =
-      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNonDebugInstruction());
+      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNode());
   ASSERT_NE(BrInst, nullptr);
   ASSERT_EQ(BrInst->getNumSuccessors(), 1U);
   BasicBlock::iterator NextInstruction =
@@ -7338,8 +7349,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskIfCondition) {
   EXPECT_EQ(TaskBeginIfCall->getParent(),
             IfConditionBranchInst->getSuccessor(1));
-  EXPECT_EQ(TaskBeginIfCall->getNextNonDebugInstruction(), OulinedFnCall);
-  EXPECT_EQ(OulinedFnCall->getNextNonDebugInstruction(), TaskCompleteCall);
+  EXPECT_EQ(TaskBeginIfCall->getNextNode(), OulinedFnCall);
+  EXPECT_EQ(OulinedFnCall->getNextNode(), TaskCompleteCall);
 }
 
 TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) {
@@ -7423,12 +7434,12 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) {
                     OMPRTL___kmpc_global_thread_num));
 
   // Checking the general structure of the IR generated is same as expected.
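The substitutions in the hunks above and in the ones that follow are mechanical, and they rest on one fact worth stating once: after the switch to debug records, debug info lives in DbgRecords attached to instructions rather than in standalone dbg.* intrinsic calls, so every node in a block's instruction list is a real instruction and getNextNode()/getPrevNode() traversal is equivalent to the old skip-the-intrinsics walk. A small illustrative sketch, not from the patch, assuming a non-empty block:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Counts instructions by plain ilist traversal. Previously this loop would
// have used getNextNonDebugInstruction() to step over dbg.value/dbg.declare
// intrinsics; with debug info carried as DbgRecords there is nothing to skip.
static unsigned countInstructions(BasicBlock &BB) {
  unsigned N = 0;
  for (Instruction *I = &BB.front(); I; I = I->getNextNode())
    ++N;
  return N;
}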
-  Instruction *GeneratedStoreInst = TaskgroupCall->getNextNonDebugInstruction();
+  Instruction *GeneratedStoreInst = TaskgroupCall->getNextNode();
   EXPECT_EQ(GeneratedStoreInst, InternalStoreInst);
   Instruction *GeneratedLoad32 =
-      GeneratedStoreInst->getNextNonDebugInstruction();
+      GeneratedStoreInst->getNextNode();
   EXPECT_EQ(GeneratedLoad32, InternalLoad32);
-  Instruction *GeneratedLoad128 = GeneratedLoad32->getNextNonDebugInstruction();
+  Instruction *GeneratedLoad128 = GeneratedLoad32->getNextNode();
   EXPECT_EQ(GeneratedLoad128, InternalLoad128);
 
   // Checking the ordering because of the if statements and that
diff --git a/llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp b/llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp
index b7f5234ffcda1..9bef77433bad5 100644
--- a/llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp
+++ b/llvm/unittests/FuzzMutate/RandomIRBuilderTest.cpp
@@ -520,7 +520,7 @@ TEST(RandomIRBuilderTest, sinkToIntrinsic) {
     ASSERT_TRUE(Modified);
 
   Modified = false;
-  I = I->getNextNonDebugInstruction();
+  I = I->getNextNode();
   for (int i = 0; i < 20; i++) {
     Value *OldOperand = I->getOperand(0);
     Value *Src = F.getArg(5);
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index 126db4d4c1625..fd8838e06f34c 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -1482,11 +1482,11 @@ TEST(InstructionsTest, SkipDebug) {
 
   // The first non-debug instruction is the terminator.
   auto *Term = BB.getTerminator();
-  EXPECT_EQ(Term, BB.begin()->getNextNonDebugInstruction());
+  EXPECT_EQ(Term, BB.begin()->getNextNode());
   EXPECT_EQ(Term->getIterator(), skipDebugIntrinsics(BB.begin()));
 
   // After the terminator, there are no non-debug instructions.
-  EXPECT_EQ(nullptr, Term->getNextNonDebugInstruction());
+  EXPECT_EQ(nullptr, Term->getNextNode());
 }
 
 TEST(InstructionsTest, PhiMightNotBeFPMathOperator) {
diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp
index ad9af2075e371..49af83609d98c 100644
--- a/llvm/unittests/IR/IntrinsicsTest.cpp
+++ b/llvm/unittests/IR/IntrinsicsTest.cpp
@@ -80,6 +80,32 @@ TEST(IntrinsicNameLookup, Basic) {
   EXPECT_EQ(memcpy_inline, lookupIntrinsicID("llvm.memcpy.inline.p0.p0.i1024"));
 }
 
+TEST(IntrinsicNameLookup, NonNullterminatedStringRef) {
+  using namespace Intrinsic;
+  // This reproduces an issue where lookupIntrinsicID() can access memory beyond
+  // the bounds of the passed in StringRef. For ASAN to catch this as an error,
+  // create a StringRef using heap allocated memory and make it not null
+  // terminated.
+
+  // ASAN will report an "AddressSanitizer: heap-buffer-overflow" error in
+  // `lookupLLVMIntrinsicByName` when LLVM is built with these options:
+  //  -DCMAKE_BUILD_TYPE=Debug
+  //  -DLLVM_USE_SANITIZER=Address
+  //  -DLLVM_OPTIMIZE_SANITIZED_BUILDS=OFF
+
+  // Make an intrinsic name "llvm.memcpy.inline" on the heap.
+  std::string Name = "llvm.memcpy.inline";
+  assert(Name.size() == 18);
+  // Create a StringRef backed by heap allocated memory such that OOB access
+  // in that StringRef can be flagged by asan. Here, the String `S` is of size
+  // 18, and backed by a heap allocated buffer `Data`, so access to S[18] will
+  // be flagged by asan.
+  auto Data = std::make_unique<char[]>(Name.size());
+  std::strncpy(Data.get(), Name.data(), Name.size());
+  StringRef S(Data.get(), Name.size());
+  EXPECT_EQ(memcpy_inline, lookupIntrinsicID(S));
+}
+
 // Tests to verify getIntrinsicForClangBuiltin.
TEST(IntrinsicNameLookup, ClangBuiltinLookup) {
   using namespace Intrinsic;
diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index 21606fff1f32b..25b390758f172 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -177,10 +177,10 @@ TEST(ELFObjectFileTest, MachineTestForPPC) {
 }
 
 TEST(ELFObjectFileTest, MachineTestForRISCV) {
-  std::array Formats = {"elf32-littleriscv", "elf32-littleriscv",
-                        "elf64-littleriscv", "elf64-littleriscv"};
-  std::array Archs = {Triple::riscv32, Triple::riscv32,
-                      Triple::riscv64, Triple::riscv64};
+  std::array Formats = {"elf32-littleriscv", "elf32-bigriscv",
+                        "elf64-littleriscv", "elf64-bigriscv"};
+  std::array Archs = {Triple::riscv32, Triple::riscv32be,
+                      Triple::riscv64, Triple::riscv64be};
   for (auto [Idx, Data] : enumerate(generateData(ELF::EM_RISCV)))
     checkFormatAndArch(Data, Formats[Idx], Archs[Idx]);
 }
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 9780eba5dc7a7..319538eaea135 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -1129,6 +1129,7 @@ R"(All available -march extensions for RISC-V
     svnapot              1.0
     svpbmt               1.0
     svvptc               1.0
+    xandesbfhcvt         5.0
     xandesperf           5.0
     xandesvbfhcvt        5.0
     xandesvdot           5.0
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index aeeea0a391ba9..53a64b6474480 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -956,10 +956,6 @@ TEST(TargetParserTest, ARMparseArchVersion) {
 
 TEST(TargetParserTest, getARMCPUForArch) {
   // Platform specific defaults.
-  {
-    llvm::Triple Triple("arm--nacl");
-    EXPECT_EQ("cortex-a8", ARM::getARMCPUForArch(Triple));
-  }
   {
     llvm::Triple Triple("arm--openbsd");
     EXPECT_EQ("cortex-a8", ARM::getARMCPUForArch(Triple));
diff --git a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
index 0b00734fc4d75..1daf381ee2862 100644
--- a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
+++ b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
@@ -54,20 +54,13 @@ struct DebugInfoDrop : public FunctionPass {
 struct DebugValueDrop : public FunctionPass {
   static char ID;
   bool runOnFunction(Function &F) override {
-    SmallVector<DbgVariableIntrinsic *> Dbgs;
     for (BasicBlock &BB : F) {
-      // Remove dbg var intrinsics.
       for (Instruction &I : BB) {
-        if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
-          Dbgs.push_back(DVI);
-        // If there are any non-intrinsic records (DbgRecords), drop those too.
+        // If there are any debug records, drop them.
I.dropDbgRecords();
       }
     }
 
-    for (auto &I : Dbgs)
-      I->eraseFromParent();
-
     return true;
   }
 
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index 3c7a892c9d65a..dd2a6249c7cf9 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -633,62 +633,6 @@ TEST(Local, ChangeToUnreachable) {
   EXPECT_EQ(DLA, DLB);
 }
 
-TEST(Local, FindDbgUsers) {
-  LLVMContext Ctx;
-  std::unique_ptr<Module> M = parseIR(Ctx,
-                                      R"(
-  define dso_local void @fun(ptr %a) #0 !dbg !11 {
-  entry:
-    #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19)
-    ret void
-  }
-
-  !llvm.dbg.cu = !{!0}
-  !llvm.module.flags = !{!2, !3, !9}
-  !llvm.ident = !{!10}
-
-  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 17.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
-  !1 = !DIFile(filename: "test.cpp", directory: "/")
-  !2 = !{i32 7, !"Dwarf Version", i32 5}
-  !3 = !{i32 2, !"Debug Info Version", i32 3}
-  !4 = !{i32 1, !"wchar_size", i32 4}
-  !9 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
-  !10 = !{!"clang version 17.0.0"}
-  !11 = distinct !DISubprogram(name: "fun", linkageName: "fun", scope: !1, file: !1, line: 1, type: !12, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !14)
-  !12 = !DISubroutineType(types: !13)
-  !13 = !{null}
-  !14 = !{}
-  !15 = distinct !DIAssignID()
-  !16 = !DILocalVariable(name: "x", scope: !11, file: !1, line: 2, type: !17)
-  !17 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !18, size: 64)
-  !18 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-  !19 = !DILocation(line: 0, scope: !11)
-  )");
-
-  bool BrokenDebugInfo = true;
-  verifyModule(*M, &errs(), &BrokenDebugInfo);
-  ASSERT_FALSE(BrokenDebugInfo);
-
-  // Convert to debug intrinsics as we want to test findDbgUsers and
-  // findDbgValue's debug-intrinsic-finding code here.
-  // TODO: Remove this test when debug intrinsics are removed.
-  M->convertFromNewDbgValues();
-
-  Function &Fun = *cast<Function>(M->getNamedValue("fun"));
-  Value *Arg = Fun.getArg(0);
-  SmallVector<DbgVariableIntrinsic *> Users;
-  // Arg (%a) is used twice by a single dbg.assign. Check findDbgUsers returns
-  // only 1 pointer to it rather than 2.
-  findDbgUsers(Users, Arg);
-  EXPECT_EQ(Users.size(), 1u);
-
-  SmallVector<DbgValueInst *> Vals;
-  // Arg (%a) is used twice by a single dbg.assign. Check findDbgValues returns
-  // only 1 pointer to it rather than 2.
-  findDbgValues(Vals, Arg);
-  EXPECT_EQ(Vals.size(), 1u);
-}
-
 TEST(Local, FindDbgRecords) {
   // DbgRecord copy of the FindDbgUsers test above.
   LLVMContext Ctx;
@@ -823,13 +767,13 @@ TEST(Local, ReplaceAllDbgUsesWith) {
   BasicBlock &BB = F.front();
   Instruction &A = BB.front();
-  Instruction &B = *A.getNextNonDebugInstruction();
-  Instruction &C = *B.getNextNonDebugInstruction();
-  Instruction &D = *C.getNextNonDebugInstruction();
-  Instruction &E = *D.getNextNonDebugInstruction();
-  Instruction &F_ = *E.getNextNonDebugInstruction();
-  Instruction &Barrier = *F_.getNextNonDebugInstruction();
-  Instruction &G = *Barrier.getNextNonDebugInstruction();
+  Instruction &B = *A.getNextNode();
+  Instruction &C = *B.getNextNode();
+  Instruction &D = *C.getNextNode();
+  Instruction &E = *D.getNextNode();
+  Instruction &F_ = *E.getNextNode();
+  Instruction &Barrier = *F_.getNextNode();
+  Instruction &G = *Barrier.getNextNode();
 
   // Simulate i32 <-> i64* conversion.
Expect no updates: the datalayout says
   // pointers are 64 bits, so the conversion would be lossy.
diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
index 177eecebce9a5..f0e23690367db 100644
--- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
@@ -106,8 +106,16 @@ static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS,
                               bool ExportEnums) {
   OS << "\n";
   OS << "enum class " << Enum << " {\n";
-  for (const Record *R : Records) {
-    OS << " " << getIdentifierName(R, Prefix) << ",\n";
+  if (!Records.empty()) {
+    std::string N;
+    for (auto [I, R] : llvm::enumerate(Records)) {
+      N = getIdentifierName(R, Prefix);
+      OS << " " << N << ",\n";
+      // Make the sentinel names less likely to conflict with actual names...
+      if (I == 0)
+        OS << " First_ = " << N << ",\n";
+    }
+    OS << " Last_ = " << N << ",\n";
   }
   OS << "};\n";
   OS << "\n";
@@ -282,6 +290,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) {
   if (DirLang.hasEnableBitmaskEnumInNamespace())
     OS << "#include \"llvm/ADT/BitmaskEnum.h\"\n";
 
+  OS << "#include \"llvm/ADT/Sequence.h\"\n";
   OS << "#include \"llvm/ADT/StringRef.h\"\n";
   OS << "#include \"llvm/Frontend/Directive/Spelling.h\"\n";
   OS << "#include \"llvm/Support/Compiler.h\"\n";
@@ -375,6 +384,15 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) {
   for (auto Ns : reverse(Namespaces))
     OS << "} // namespace " << Ns << "\n";
 
+  // These specializations need to be in ::llvm.
+  for (StringRef Enum : {"Association", "Category", "Directive", "Clause"}) {
+    OS << "\n";
+    OS << "template <> struct enum_iteration_traits<"
+       << DirLang.getCppNamespace() << "::" << Enum << "> {\n";
+    OS << " static constexpr bool is_iterable = true;\n";
+    OS << "};\n";
+  }
+
   OS << "} // namespace llvm\n";
 
   OS << "#endif // LLVM_" << Lang << "_INC\n";
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index 68bd1b5c9cb21..652bea9dc7f65 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -8,10 +8,12 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/SetTheory.h"
+#include "llvm/TableGen/StringToOffsetTable.h"
 #include "llvm/TableGen/TableGenBackend.h"
 
 using namespace llvm;
@@ -305,8 +307,6 @@ void RuntimeLibcallEmitter::emitGetRuntimeLibcallEnum(raw_ostream &OS) const {
 
 void RuntimeLibcallEmitter::emitGetInitRuntimeLibcallNames(
     raw_ostream &OS) const {
-  // TODO: Emit libcall names as string offset table.
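The removed TODO above is implemented by the hunk that continues below: instead of emitting an array of const char * (one pointer, and therefore one relocation, per entry), the emitter now writes a single character blob via StringToOffsetTable plus a uint16_t offset table indexed by libcall implementation. A hand-written sketch of the layout being generated; the names here are illustrative, not the emitter's actual output:

#include <cstdint>
#include <string_view>

// One shared blob; each name is NUL-terminated and referenced by offset,
// so the tables are relocation-free and shared across entries.
inline constexpr char NameTable[] = "\0"        // offset 0: unsupported/empty
                                    "memcpy\0"  // offset 1
                                    "memset\0"; // offset 8

inline constexpr uint16_t NameOffsets[] = {0, 1, 8};

inline std::string_view libcallImplName(unsigned Impl) {
  return &NameTable[NameOffsets[Impl]]; // reads up to the trailing NUL
}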
- OS << "const RTLIB::LibcallImpl " "llvm::RTLIB::RuntimeLibcallsInfo::" "DefaultLibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {\n"; @@ -331,17 +331,24 @@ void RuntimeLibcallEmitter::emitGetInitRuntimeLibcallNames( "};\n\n"; // Emit the implementation names - OS << "const char *const llvm::RTLIB::RuntimeLibcallsInfo::" - "LibCallImplNames[RTLIB::NumLibcallImpls] = {\n" - " nullptr, // RTLIB::Unsupported\n"; + StringToOffsetTable Table(/*AppendZero=*/true, + "RTLIB::RuntimeLibcallsInfo::"); + + for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList) + Table.GetOrAddStringOffset(LibCallImpl.getLibcallFuncName()); + + Table.EmitStringTableDef(OS, "RuntimeLibcallImplNameTable"); + OS << R"( +const uint16_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameOffsetTable[] = { +)"; + OS << formatv(" {}, // {}\n", Table.GetStringOffset(""), + ""); // Unsupported entry for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList) { - OS << " \"" << LibCallImpl.getLibcallFuncName() << "\", // "; - LibCallImpl.emitEnumEntry(OS); - OS << '\n'; + StringRef ImplName = LibCallImpl.getLibcallFuncName(); + OS << formatv(" {}, // {}\n", Table.GetStringOffset(ImplName), ImplName); } - - OS << "};\n\n"; + OS << "};\n"; // Emit the reverse mapping from implementation libraries to RTLIB::Libcall OS << "const RTLIB::Libcall llvm::RTLIB::RuntimeLibcallsInfo::" diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index c43cc9afe1e3c..28b542f09e8c0 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -701,11 +701,13 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Orders.resize(1 + AltOrders->size()); // Default allocation order always contains all registers. + MemberBV.resize(RegBank.getRegisters().size()); Artificial = true; for (const Record *Element : *Elements) { Orders[0].push_back(Element); const CodeGenRegister *Reg = RegBank.getReg(Element); Members.push_back(Reg); + MemberBV.set(CodeGenRegBank::getRegIndex(Reg)); Artificial &= Reg->Artificial; if (!Reg->getSuperRegs().empty()) RegsWithSuperRegsTopoSigs.set(Reg->getTopoSig()); @@ -767,9 +769,11 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, RegsWithSuperRegsTopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), RSI(Props.RSI), CopyCost(0), Allocatable(true), AllocationPriority(0), GlobalPriority(false), TSFlags(0) { + MemberBV.resize(RegBank.getRegisters().size()); Artificial = true; GeneratePressureSet = false; for (const auto R : Members) { + MemberBV.set(CodeGenRegBank::getRegIndex(R)); if (!R->getSuperRegs().empty()) RegsWithSuperRegsTopoSigs.set(R->getTopoSig()); Artificial &= R->Artificial; @@ -833,7 +837,7 @@ bool CodeGenRegisterClass::hasType(const ValueTypeByHwMode &VT) const { } bool CodeGenRegisterClass::contains(const CodeGenRegister *Reg) const { - return llvm::binary_search(Members, Reg, deref>()); + return MemberBV.test(CodeGenRegBank::getRegIndex(Reg)); } unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank &RegBank) const { @@ -2295,9 +2299,6 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) { SRSets[I].push_back(R); } - for (auto I : SRSets) - sortAndUniqueRegisters(I.second); - // Find matching classes for all SRSets entries. Iterate in SubRegIndex // numerical order to visit synthetic indices last. 
for (const CodeGenSubRegIndex &SubIdx : SubRegIndices) {
@@ -2332,8 +2333,7 @@ void CodeGenRegBank::inferMatchingSuperRegClass(
     CodeGenRegisterClass *RC,
     std::list<CodeGenRegisterClass>::iterator FirstSubRegRC) {
   DenseSet<const CodeGenSubRegIndex *> ImpliedSubRegIndices;
-  std::vector<std::pair<const CodeGenRegister *, const CodeGenRegister *>>
-      SubToSuperRegs;
+  std::vector<const CodeGenRegister *> SubRegs;
   BitVector TopoSigs(getNumTopoSigs());
 
   // Iterate subregister indices in topological order to visit larger indices
@@ -2351,15 +2351,14 @@ void CodeGenRegBank::inferMatchingSuperRegClass(
 
     // Build list of (Sub, Super) pairs for this SubIdx, sorted by Sub. Note
    // that the list may contain entries with the same Sub but different Supers.
-    SubToSuperRegs.clear();
+    SubRegs.clear();
     TopoSigs.reset();
     for (const CodeGenRegister *Super : RC->getMembers()) {
       const CodeGenRegister *Sub = Super->getSubRegs().find(SubIdx)->second;
       assert(Sub && "Missing sub-register");
-      SubToSuperRegs.emplace_back(Sub, Super);
+      SubRegs.push_back(Sub);
       TopoSigs.set(Sub->getTopoSig());
     }
-    sort(SubToSuperRegs, on_first<deref<std::less<>>>());
 
     // Iterate over sub-register class candidates. Ignore classes created by
     // this loop. They will never be useful.
@@ -2374,16 +2373,10 @@ void CodeGenRegBank::inferMatchingSuperRegClass(
       // Topological shortcut: SubRC members have the wrong shape.
       if (!TopoSigs.anyCommon(SubRC.getRegsWithSuperRegsTopoSigs()))
         continue;
-      // Compute the subset of RC that maps into SubRC with a single linear scan
-      // through SubToSuperRegs and the members of SubRC.
+      // Compute the subset of RC that maps into SubRC.
       CodeGenRegister::Vec SubSetVec;
-      auto SubI = SubRC.getMembers().begin(), SubE = SubRC.getMembers().end();
-      for (auto &[Sub, Super] : SubToSuperRegs) {
-        while (SubI != SubE && **SubI < *Sub)
-          ++SubI;
-        if (SubI == SubE)
-          break;
-        if (**SubI == *Sub)
+      for (const auto &[Sub, Super] : zip_equal(SubRegs, RC->getMembers())) {
+        if (SubRC.contains(Sub))
           SubSetVec.push_back(Super);
       }
 
@@ -2391,7 +2384,6 @@ void CodeGenRegBank::inferMatchingSuperRegClass(
         continue;
 
       // RC injects completely into SubRC.
-      sortAndUniqueRegisters(SubSetVec);
       if (SubSetVec.size() == RC->getMembers().size()) {
         SubRC.addSuperRegClass(SubIdx, RC);
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h
index bbcd44ce2cc5b..5e6fff0f775ea 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.h
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h
@@ -315,6 +315,8 @@ inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) {
 
 class CodeGenRegisterClass {
   CodeGenRegister::Vec Members;
+  // Bit mask of members, indexed by getRegIndex.
+  BitVector MemberBV;
   // Allocation orders. Order[0] always contains all registers in Members.
   std::vector<SmallVector<const Record *, 16>> Orders;
   // Bit mask of sub-classes including this, indexed by their EnumValue.
@@ -752,7 +754,7 @@ class CodeGenRegBank {
   CodeGenRegister *getReg(const Record *);
 
   // Get a Register's index into the Registers array.
-  unsigned getRegIndex(const CodeGenRegister *Reg) const {
+  static unsigned getRegIndex(const CodeGenRegister *Reg) {
     return Reg->EnumValue - 1;
   }
diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp
index d8626a0ce9e43..afc892b068582 100644
--- a/llvm/utils/TableGen/CompressInstEmitter.cpp
+++ b/llvm/utils/TableGen/CompressInstEmitter.cpp
@@ -171,10 +171,6 @@ bool CompressInstEmitter::validateTypes(const Record *DagOpType,
                                         bool IsSourceInst) {
   if (DagOpType == InstOpType)
     return true;
-  // Only source instruction operands are allowed to not match Input Dag
-  // operands.
- if (!IsSourceInst) - return false; if (DagOpType->isSubClassOf("RegisterClass") && InstOpType->isSubClassOf("RegisterClass")) { @@ -258,9 +254,9 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec, continue; } // Validate that Dag operand type matches the type defined in the - // corresponding instruction. Operands in the input Dag pattern are - // allowed to be a subclass of the type specified in corresponding - // instruction operand instead of being an exact match. + // corresponding instruction. Operands in the input and output Dag + // patterns are allowed to be a subclass of the type specified in the + // corresponding instruction operand instead of being an exact match. if (!validateTypes(DI->getDef(), OpndRec, IsSourceInst)) PrintFatalError(Rec->getLoc(), "Error in Dag '" + Dag->getAsString() + diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index f523fc26a6671..21c611e364a8d 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -227,14 +227,14 @@ class DecoderEmitter { // Emit the decoder state machine table. Returns a mask of MCD decoder ops // that were emitted. unsigned emitTable(formatted_raw_ostream &OS, DecoderTable &Table, - indent Indent, unsigned BitWidth, StringRef Namespace, + unsigned BitWidth, StringRef Namespace, const EncodingIDsVec &EncodingIDs) const; void emitInstrLenTable(formatted_raw_ostream &OS, ArrayRef InstrLen) const; void emitPredicateFunction(formatted_raw_ostream &OS, - PredicateSet &Predicates, indent Indent) const; - void emitDecoderFunction(formatted_raw_ostream &OS, DecoderSet &Decoders, - indent Indent) const; + PredicateSet &Predicates) const; + void emitDecoderFunction(formatted_raw_ostream &OS, + DecoderSet &Decoders) const; // run - Output the code emitter void run(raw_ostream &o); @@ -833,8 +833,8 @@ unsigned Filter::usefulness() const { // Emit the decoder state machine table. Returns a mask of MCD decoder ops // that were emitted. unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, - DecoderTable &Table, indent Indent, - unsigned BitWidth, StringRef Namespace, + DecoderTable &Table, unsigned BitWidth, + StringRef Namespace, const EncodingIDsVec &EncodingIDs) const { // We'll need to be able to map from a decoded opcode into the corresponding // EncodingID for this specific combination of BitWidth and Namespace. This @@ -844,11 +844,9 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, for (const auto &EI : EncodingIDs) OpcodeToEncodingID[EI.Opcode] = EI.EncodingID; - OS << Indent << "static const uint8_t DecoderTable" << Namespace << BitWidth + OS << "static const uint8_t DecoderTable" << Namespace << BitWidth << "[] = {\n"; - Indent += 2; - // Emit ULEB128 encoded value to OS, returning the number of bytes emitted. auto emitULEB128 = [](DecoderTable::const_iterator &I, formatted_raw_ostream &OS) { @@ -905,7 +903,7 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, PrintFatalError("Invalid decode table opcode: " + Twine((int)DecoderOp) + " at index " + Twine(Pos)); case MCD::OPC_ExtractField: { - OS << Indent << "MCD::OPC_ExtractField, "; + OS << " MCD::OPC_ExtractField, "; // ULEB128 encoded start value. const char *ErrMsg = nullptr; @@ -923,7 +921,7 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, case MCD::OPC_FilterValue: case MCD::OPC_FilterValueOrFail: { bool IsFail = DecoderOp == MCD::OPC_FilterValueOrFail; - OS << Indent << "MCD::OPC_FilterValue" << (IsFail ? 
"OrFail, " : ", "); + OS << " MCD::OPC_FilterValue" << (IsFail ? "OrFail, " : ", "); // The filter value is ULEB128 encoded. emitULEB128(I, OS); @@ -937,7 +935,7 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, case MCD::OPC_CheckField: case MCD::OPC_CheckFieldOrFail: { bool IsFail = DecoderOp == MCD::OPC_CheckFieldOrFail; - OS << Indent << "MCD::OPC_CheckField" << (IsFail ? "OrFail, " : ", "); + OS << " MCD::OPC_CheckField" << (IsFail ? "OrFail, " : ", "); // ULEB128 encoded start value. emitULEB128(I, OS); // 8-bit length. @@ -957,7 +955,7 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, case MCD::OPC_CheckPredicateOrFail: { bool IsFail = DecoderOp == MCD::OPC_CheckPredicateOrFail; - OS << Indent << "MCD::OPC_CheckPredicate" << (IsFail ? "OrFail, " : ", "); + OS << " MCD::OPC_CheckPredicate" << (IsFail ? "OrFail, " : ", "); emitULEB128(I, OS); if (!IsFail) { @@ -977,7 +975,7 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, unsigned Opc = decodeULEB128(&*I, nullptr, EndPtr, &ErrMsg); assert(ErrMsg == nullptr && "ULEB128 value too large!"); - OS << Indent << "MCD::OPC_" << (IsTry ? "Try" : "") << "Decode" + OS << " MCD::OPC_" << (IsTry ? "Try" : "") << "Decode" << (IsFail ? "OrFail, " : ", "); emitULEB128(I, OS); @@ -1007,7 +1005,7 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, break; } case MCD::OPC_SoftFail: { - OS << Indent << "MCD::OPC_SoftFail, "; + OS << " MCD::OPC_SoftFail, "; // Decode the positive mask. const char *ErrMsg = nullptr; uint64_t PositiveMask = decodeULEB128(&*I, nullptr, EndPtr, &ErrMsg); @@ -1026,15 +1024,12 @@ unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, break; } case MCD::OPC_Fail: - OS << Indent << "MCD::OPC_Fail,\n"; + OS << " MCD::OPC_Fail,\n"; break; } } - OS << Indent << "0\n"; - - Indent -= 2; - - OS << Indent << "};\n\n"; + OS << " 0\n"; + OS << "};\n\n"; return OpcodeMask; } @@ -1048,27 +1043,23 @@ void DecoderEmitter::emitInstrLenTable(formatted_raw_ostream &OS, } void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS, - PredicateSet &Predicates, - indent Indent) const { + PredicateSet &Predicates) const { // The predicate function is just a big switch statement based on the // input predicate index. - OS << Indent << "static bool checkDecoderPredicate(unsigned Idx, " - << "const FeatureBitset &Bits) {\n"; - Indent += 2; - OS << Indent << "switch (Idx) {\n"; - OS << Indent << "default: llvm_unreachable(\"Invalid index!\");\n"; + OS << "static bool checkDecoderPredicate(unsigned Idx, const FeatureBitset " + "&Bits) {\n"; + OS << " switch (Idx) {\n"; + OS << " default: llvm_unreachable(\"Invalid index!\");\n"; for (const auto &[Index, Predicate] : enumerate(Predicates)) { - OS << Indent << "case " << Index << ":\n"; - OS << Indent + 2 << "return (" << Predicate << ");\n"; + OS << " case " << Index << ":\n"; + OS << " return (" << Predicate << ");\n"; } - OS << Indent << "}\n"; - Indent -= 2; - OS << Indent << "}\n\n"; + OS << " }\n"; + OS << "}\n\n"; } void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, - DecoderSet &Decoders, - indent Indent) const { + DecoderSet &Decoders) const { // The decoder function is just a big switch statement or a table of function // pointers based on the input decoder index. @@ -1085,53 +1076,46 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, if (UseFnTableInDecodeToMCInst) { // Emit a function for each case first. 
for (const auto &[Index, Decoder] : enumerate(Decoders)) { - OS << Indent << "template \n"; - OS << Indent << "DecodeStatus decodeFn" << Index << "(" << DecodeParams - << ") {\n"; - Indent += 2; - OS << Indent << TmpTypeDecl; - OS << Indent << "[[maybe_unused]] TmpType tmp;\n"; + OS << "template \n"; + OS << "DecodeStatus decodeFn" << Index << "(" << DecodeParams << ") {\n"; + OS << " " << TmpTypeDecl; + OS << " [[maybe_unused]] TmpType tmp;\n"; OS << Decoder; - OS << Indent << "return S;\n"; - Indent -= 2; - OS << Indent << "}\n\n"; + OS << " return S;\n"; + OS << "}\n\n"; } } - OS << Indent << "// Handling " << Decoders.size() << " cases.\n"; - OS << Indent << "template \n"; - OS << Indent << "static DecodeStatus decodeToMCInst(unsigned Idx, " - << DecodeParams << ") {\n"; - Indent += 2; - OS << Indent << "DecodeComplete = true;\n"; + OS << "// Handling " << Decoders.size() << " cases.\n"; + OS << "template \n"; + OS << "static DecodeStatus decodeToMCInst(unsigned Idx, " << DecodeParams + << ") {\n"; + OS << " DecodeComplete = true;\n"; if (UseFnTableInDecodeToMCInst) { // Build a table of function pointers. - OS << Indent << "using DecodeFnTy = DecodeStatus (*)(" << DecodeParams - << ");\n"; - OS << Indent << "static constexpr DecodeFnTy decodeFnTable[] = {\n"; + OS << " using DecodeFnTy = DecodeStatus (*)(" << DecodeParams << ");\n"; + OS << " static constexpr DecodeFnTy decodeFnTable[] = {\n"; for (size_t Index : llvm::seq(Decoders.size())) - OS << Indent + 2 << "decodeFn" << Index << ",\n"; - OS << Indent << "};\n"; - OS << Indent << "if (Idx >= " << Decoders.size() << ")\n"; - OS << Indent + 2 << "llvm_unreachable(\"Invalid index!\");\n"; - OS << Indent - << "return decodeFnTable[Idx](S, insn, MI, Address, Decoder, " + OS << " decodeFn" << Index << ",\n"; + OS << " };\n"; + OS << " if (Idx >= " << Decoders.size() << ")\n"; + OS << " llvm_unreachable(\"Invalid index!\");\n"; + OS << " return decodeFnTable[Idx](S, insn, MI, Address, Decoder, " "DecodeComplete);\n"; } else { - OS << Indent << TmpTypeDecl; - OS << Indent << "TmpType tmp;\n"; - OS << Indent << "switch (Idx) {\n"; - OS << Indent << "default: llvm_unreachable(\"Invalid index!\");\n"; + OS << " " << TmpTypeDecl; + OS << " TmpType tmp;\n"; + OS << " switch (Idx) {\n"; + OS << " default: llvm_unreachable(\"Invalid index!\");\n"; for (const auto &[Index, Decoder] : enumerate(Decoders)) { - OS << Indent << "case " << Index << ":\n"; + OS << " case " << Index << ":\n"; OS << Decoder; - OS << Indent + 2 << "return S;\n"; + OS << " return S;\n"; } - OS << Indent << "}\n"; + OS << " }\n"; } - Indent -= 2; - OS << Indent << "}\n"; + OS << "}\n"; } // Populates the field of the insn given the start position and the number of @@ -2673,7 +2657,7 @@ namespace { TableInfo.Table.push_back(MCD::OPC_Fail); // Print the table to the output stream. - OpcodeMask |= emitTable(OS, TableInfo.Table, indent(0), FC.getBitWidth(), + OpcodeMask |= emitTable(OS, TableInfo.Table, FC.getBitWidth(), DecoderNamespace, EncodingIDs); } @@ -2689,10 +2673,10 @@ namespace { // Emit the predicate function. if (HasCheckPredicate) - emitPredicateFunction(OS, TableInfo.Predicates, indent(0)); + emitPredicateFunction(OS, TableInfo.Predicates); // Emit the decoder function. - emitDecoderFunction(OS, TableInfo.Decoders, indent(0)); + emitDecoderFunction(OS, TableInfo.Decoders); // Emit the main entry point for the decoder, decodeInstruction(). 
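The `UseFnTableInDecodeToMCInst` path above emits one small function per decoder case and dispatches through a `constexpr` table of function pointers instead of one giant `switch`. A minimal, self-contained sketch of that dispatch shape follows; the names (`decodeFn0`, `decodeFn1`) and the `DecodeStatus` values are illustrative stand-ins, not the emitter's actual output:

```cpp
#include <cstdint>
#include <cstdio>

enum DecodeStatus { Fail = 0, Success = 3 };

// Stand-ins for the per-case functions the emitter would generate.
static DecodeStatus decodeFn0(uint64_t Insn) { return Success; }
static DecodeStatus decodeFn1(uint64_t Insn) { return Insn ? Success : Fail; }

// Mirrors the emitted decodeToMCInst: index into a constexpr table of
// function pointers rather than switching over Idx.
static DecodeStatus decodeToMCInst(unsigned Idx, uint64_t Insn) {
  using DecodeFnTy = DecodeStatus (*)(uint64_t);
  static constexpr DecodeFnTy decodeFnTable[] = {decodeFn0, decodeFn1};
  if (Idx >= sizeof(decodeFnTable) / sizeof(decodeFnTable[0]))
    return Fail; // the generated code calls llvm_unreachable here
  return decodeFnTable[Idx](Insn);
}

int main() {
  std::printf("%d\n", decodeToMCInst(1, 0x2a)); // prints 3 (Success)
}
```

Splitting the cases into separate functions presumably helps compile time for very large decoders; dispatch stays constant-time either way.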
emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask); diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 7d24c0f80cddb..2a311b7ff96b8 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -1644,7 +1644,7 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS) { for (const CodeGenRegister &Reg : Regs) { const CodeGenRegisterClass *BaseRC = nullptr; for (const CodeGenRegisterClass *RC : BaseClasses) { - if (is_contained(RC->getMembers(), &Reg)) { + if (RC->contains(&Reg)) { BaseRC = RC; break; } diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index e24429a9167bd..bdb5a1e8d9b74 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -56,8 +56,8 @@ static_library("Basic") { ] deps = [ ":write_vcsversion", - "//clang/include/clang/Basic:AttributeSpellingList", "//clang/include/clang/Basic:AttrHasAttributeImpl", + "//clang/include/clang/Basic:AttributeSpellingList", "//clang/include/clang/Basic:CXX11AttributeInfo", "//clang/include/clang/Basic:DiagnosticAllCompatIDs", "//clang/include/clang/Basic:arm_fp16", @@ -128,7 +128,6 @@ static_library("Basic") { "Targets/Mips.cpp", "Targets/NVPTX.cpp", "Targets/OSTargets.cpp", - "Targets/PNaCl.cpp", "Targets/PPC.cpp", "Targets/RISCV.cpp", "Targets/SPIR.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn index f21b05d031fd7..e36ca4005c657 100644 --- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn @@ -133,7 +133,6 @@ static_library("CodeGen") { "Targets/MSP430.cpp", "Targets/Mips.cpp", "Targets/NVPTX.cpp", - "Targets/PNaCl.cpp", "Targets/PPC.cpp", "Targets/RISCV.cpp", "Targets/SPIR.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn index 7c05c77d9753a..e1f31179b9a20 100644 --- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn @@ -87,7 +87,6 @@ static_library("Driver") { "ToolChains/Managarm.cpp", "ToolChains/MinGW.cpp", "ToolChains/MipsLinux.cpp", - "ToolChains/NaCl.cpp", "ToolChains/NetBSD.cpp", "ToolChains/OHOS.cpp", "ToolChains/OpenBSD.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 0220ced1f63ac..4eab61b6e9ce2 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -205,6 +205,7 @@ copy("Headers") { "cmpccxaddintrin.h", "cpuid.h", "crc32intrin.h", + "cuda_wrappers/__utility/declval.h", "cuda_wrappers/algorithm", "cuda_wrappers/bits/basic_string.h", "cuda_wrappers/bits/shared_ptr_base.h", @@ -304,6 +305,7 @@ copy("Headers") { "riscv_bitmanip.h", "riscv_corev_alu.h", "riscv_crypto.h", + "riscv_nds.h", "riscv_ntlh.h", "rtmintrin.h", "s390intrin.h", diff --git a/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn index 8f7beea152ab7..30b8bb61184bd 100644 --- a/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn @@ -87,6 +87,7 @@ shared_library("libclang") { "Index_Internal.h", "Indexing.cpp", "Rewrite.cpp", + "Obsolete.cpp", ] if (host_os == "mac") { ldflags = [ diff --git 
a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 6051674a790e8..82ec8121548c9 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -892,6 +892,7 @@ if (current_toolchain == default_toolchain) { "__cxx03/__utility/unreachable.h", "__cxx03/__variant/monostate.h", "__cxx03/__verbose_abort", + "__cxx03/__verbose_trap", "__cxx03/algorithm", "__cxx03/array", "__cxx03/atomic", @@ -1060,6 +1061,7 @@ if (current_toolchain == default_toolchain) { "__format/indic_conjunct_break_table.h", "__format/parser_std_format_spec.h", "__format/range_default_formatter.h", + "__format/range_format.h", "__format/range_formatter.h", "__format/unicode.h", "__format/width_estimation_table.h", @@ -1171,7 +1173,6 @@ if (current_toolchain == default_toolchain) { "__locale_dir/check_grouping.h", "__locale_dir/get_c_locale.h", "__locale_dir/locale_base_api.h", - "__locale_dir/locale_base_api/android.h", "__locale_dir/locale_base_api/bsd_locale_fallbacks.h", "__locale_dir/locale_base_api/ibm.h", "__locale_dir/locale_base_api/musl.h", @@ -1601,6 +1602,7 @@ if (current_toolchain == default_toolchain) { "__vector/vector_bool.h", "__vector/vector_bool_formatter.h", "__verbose_abort", + "__verbose_trap", "algorithm", "any", "array", diff --git a/llvm/utils/gn/secondary/lld/MachO/BUILD.gn b/llvm/utils/gn/secondary/lld/MachO/BUILD.gn index db608e3cc7449..b118d16441960 100644 --- a/llvm/utils/gn/secondary/lld/MachO/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/MachO/BUILD.gn @@ -37,6 +37,7 @@ static_library("MachO") { "ICF.cpp", "InputFiles.cpp", "InputSection.cpp", + "LinkerOptimizationHints.cpp", "LTO.cpp", "MapFile.cpp", "MarkLive.cpp", diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn index 3a7508ab7187e..b6b8f2f64caf8 100644 --- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn @@ -41,11 +41,11 @@ static_library("CPlusPlus") { "CxxStringTypes.cpp", "Generic.cpp", "GenericBitset.cpp", + "GenericList.cpp", "GenericOptional.cpp", "LibCxx.cpp", "LibCxxAtomic.cpp", "LibCxxInitializerList.cpp", - "LibCxxList.cpp", "LibCxxMap.cpp", "LibCxxProxyArray.cpp", "LibCxxQueue.cpp", @@ -63,5 +63,7 @@ static_library("CPlusPlus") { "MSVCUndecoratedNameParser.cpp", "MsvcStl.cpp", "MsvcStlSmartPointer.cpp", + "MsvcStlTuple.cpp", + "MsvcStlVector.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn index 87c02122e0f63..2d1a42890cd93 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn @@ -68,7 +68,6 @@ static_library("MCTargetDesc") { "MipsMCAsmInfo.cpp", "MipsMCCodeEmitter.cpp", "MipsMCTargetDesc.cpp", - "MipsNaClELFStreamer.cpp", "MipsOptionRecord.cpp", "MipsTargetStreamer.cpp", "MipsWinCOFFObjectWriter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/BinaryFormat/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BinaryFormat/BUILD.gn index 715baee1e3a09..d0d2454beaa48 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/BinaryFormat/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/BinaryFormat/BUILD.gn @@ -12,6 +12,7 @@ unittest("BinaryFormatTests") { "MsgPackDocumentTest.cpp", "MsgPackReaderTest.cpp", 
"MsgPackWriterTest.cpp", + "SFrameTest.cpp", "TestFileMagic.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni index 2b1a9076afe4a..83afb93f77ca7 100644 --- a/llvm/utils/gn/secondary/llvm/version.gni +++ b/llvm/utils/gn/secondary/llvm/version.gni @@ -1,4 +1,4 @@ -llvm_version_major = 21 +llvm_version_major = 22 llvm_version_minor = 0 llvm_version_patch = 0 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch" diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py index b5aa8edc03dc7..aeab06cab453b 100644 --- a/llvm/utils/lit/lit/__init__.py +++ b/llvm/utils/lit/lit/__init__.py @@ -2,7 +2,7 @@ __author__ = "Daniel Dunbar" __email__ = "daniel@minormatter.com" -__versioninfo__ = (21, 0, 0) +__versioninfo__ = (22, 0, 0) __version__ = ".".join(str(v) for v in __versioninfo__) + "dev" __all__ = [] diff --git a/llvm/utils/llvm-original-di-preservation.py b/llvm/utils/llvm-original-di-preservation.py index 03793b1136f8d..b5ccd7a3224f8 100755 --- a/llvm/utils/llvm-original-di-preservation.py +++ b/llvm/utils/llvm-original-di-preservation.py @@ -11,7 +11,6 @@ from collections import defaultdict from collections import OrderedDict - class DILocBug: def __init__(self, origin, action, bb_name, fn_name, instr): self.origin = origin @@ -20,18 +19,35 @@ def __init__(self, origin, action, bb_name, fn_name, instr): self.fn_name = fn_name self.instr = instr - def __str__(self): + def key(self): return self.action + self.bb_name + self.fn_name + self.instr + def to_dict(self): + result = { + "instr": self.instr, + "fn_name": self.fn_name, + "bb_name": self.bb_name, + "action": self.action, + } + if self.origin: + result["origin"] = self.origin + return result + class DISPBug: def __init__(self, action, fn_name): self.action = action self.fn_name = fn_name - def __str__(self): + def key(self): return self.action + self.fn_name + def to_dict(self): + return { + "fn_name": self.fn_name, + "action": self.action, + } + class DIVarBug: def __init__(self, action, name, fn_name): @@ -39,9 +55,41 @@ def __init__(self, action, name, fn_name): self.name = name self.fn_name = fn_name - def __str__(self): + def key(self): return self.action + self.name + self.fn_name + def to_dict(self): + return { + "fn_name": self.fn_name, + "name": self.name, + "action": self.action, + } + + +def print_bugs_yaml(name, bugs_dict, indent=2): + def get_bug_line(indent_level: int, text: str, margin_mark: bool = False): + if margin_mark: + return "- ".rjust(indent_level * indent) + text + return " " * indent * indent_level + text + + print(f"{name}:") + for bugs_file, bugs_pass_dict in sorted(iter(bugs_dict.items())): + print(get_bug_line(1, f"{bugs_file}:")) + for bugs_pass, bugs_list in sorted(iter(bugs_pass_dict.items())): + print(get_bug_line(2, f"{bugs_pass}:")) + for bug in bugs_list: + bug_dict = bug.to_dict() + first_line = True + # First item needs a '-' in the margin. + for key, val in sorted(iter(bug_dict.items())): + if "\n" in val: + # Output block text for any multiline string. + print(get_bug_line(3, f"{key}: |", first_line)) + for line in val.splitlines(): + print(get_bug_line(4, line)) + else: + print(get_bug_line(3, f"{key}: {val}", first_line)) + first_line = False # Report the bugs in form of html. def generate_html_report( @@ -430,9 +478,16 @@ def get_json_chunk(file, start, size): # Parse the program arguments. 
def parse_program_args(parser): parser.add_argument("file_name", type=str, help="json file to process") - parser.add_argument("html_file", type=str, help="html file to output data") - parser.add_argument( - "-compress", action="store_true", help="create reduced html report" + parser.add_argument("--reduce", action="store_true", help="create reduced report") + + report_type_group = parser.add_mutually_exclusive_group(required=True) + report_type_group.add_argument( + "--report-html-file", type=str, help="output HTML file for the generated report" + ) + report_type_group.add_argument( + "--acceptance-test", + action="store_true", + help="if set, produce terminal-friendly output and return 0 iff the input file is empty or does not exist", + ) return parser.parse_args() @@ -442,10 +497,22 @@ def Main(): parser = argparse.ArgumentParser() opts = parse_program_args(parser) - if not opts.html_file.endswith(".html"): + if opts.report_html_file is not None and not opts.report_html_file.endswith( + ".html" + ): print("error: The output file must be '.html'.") sys.exit(1) + if opts.acceptance_test: + if os.path.isdir(opts.file_name): + print(f"error: Directory passed as input file: '{opts.file_name}'") + sys.exit(1) + if not os.path.exists(opts.file_name): + # We treat an empty input file as a success, as debugify will generate an output file iff any errors are + # found, meaning we expect 0 errors to mean that the expected file does not exist. + print(f"No errors detected for: {opts.file_name}") + sys.exit(0) + # Use the defaultdict in order to make multidim dicts. di_location_bugs = defaultdict(lambda: defaultdict(list)) di_subprogram_bugs = defaultdict(lambda: defaultdict(list)) @@ -489,9 +556,9 @@ def Main(): skipped_lines += 1 continue - di_loc_bugs = di_location_bugs[bugs_file][bugs_pass] - di_sp_bugs = di_subprogram_bugs[bugs_file][bugs_pass] - di_var_bugs = di_variable_bugs[bugs_file][bugs_pass] + di_loc_bugs = di_location_bugs.get(bugs_file, {}).get(bugs_pass, []) + di_sp_bugs = di_subprogram_bugs.get(bugs_file, {}).get(bugs_pass, []) + di_var_bugs = di_variable_bugs.get(bugs_file, {}).get(bugs_pass, []) # Omit duplicated bugs.
di_loc_set = set() @@ -515,9 +582,9 @@ def Main(): skipped_bugs += 1 continue di_loc_bug = DILocBug(origin, action, bb_name, fn_name, instr) - if not str(di_loc_bug) in di_loc_set: - di_loc_set.add(str(di_loc_bug)) - if opts.compress: + if not di_loc_bug.key() in di_loc_set: + di_loc_set.add(di_loc_bug.key()) + if opts.reduce: pass_instr = bugs_pass + instr if not pass_instr in di_loc_pass_instr_set: di_loc_pass_instr_set.add(pass_instr) @@ -538,9 +605,9 @@ def Main(): skipped_bugs += 1 continue di_sp_bug = DISPBug(action, name) - if not str(di_sp_bug) in di_sp_set: - di_sp_set.add(str(di_sp_bug)) - if opts.compress: + if not di_sp_bug.key() in di_sp_set: + di_sp_set.add(di_sp_bug.key()) + if opts.reduce: pass_fn = bugs_pass + name if not pass_fn in di_sp_pass_fn_set: di_sp_pass_fn_set.add(pass_fn) @@ -562,9 +629,9 @@ def Main(): skipped_bugs += 1 continue di_var_bug = DIVarBug(action, name, fn_name) - if not str(di_var_bug) in di_var_set: - di_var_set.add(str(di_var_bug)) - if opts.compress: + if not di_var_bug.key() in di_var_set: + di_var_set.add(di_var_bug.key()) + if opts.reduce: pass_var = bugs_pass + name if not pass_var in di_var_pass_var_set: di_var_pass_var_set.add(pass_var) @@ -582,19 +649,40 @@ def Main(): skipped_bugs += 1 continue - di_location_bugs[bugs_file][bugs_pass] = di_loc_bugs - di_subprogram_bugs[bugs_file][bugs_pass] = di_sp_bugs - di_variable_bugs[bugs_file][bugs_pass] = di_var_bugs - - generate_html_report( - di_location_bugs, - di_subprogram_bugs, - di_variable_bugs, - di_location_bugs_summary, - di_sp_bugs_summary, - di_var_bugs_summary, - opts.html_file, - ) + if di_loc_bugs: + di_location_bugs[bugs_file][bugs_pass] = di_loc_bugs + if di_sp_bugs: + di_subprogram_bugs[bugs_file][bugs_pass] = di_sp_bugs + if di_var_bugs: + di_variable_bugs[bugs_file][bugs_pass] = di_var_bugs + + if opts.report_html_file is not None: + generate_html_report( + di_location_bugs, + di_subprogram_bugs, + di_variable_bugs, + di_location_bugs_summary, + di_sp_bugs_summary, + di_var_bugs_summary, + opts.report_html_file, + ) + else: + # Pretty(ish) print the detected bugs, but check if any exist first so that we don't print an empty dict. + if di_location_bugs: + print_bugs_yaml("DILocation Bugs", di_location_bugs) + if di_subprogram_bugs: + print_bugs_yaml("DISubprogram Bugs", di_subprogram_bugs) + if di_variable_bugs: + print_bugs_yaml("DIVariable Bugs", di_variable_bugs) + + if opts.acceptance_test: + if any((di_location_bugs, di_subprogram_bugs, di_variable_bugs)): + # Add a newline gap after printing at least one error. + print() + print(f"Errors detected for: {opts.file_name}") + sys.exit(1) + else: + print(f"No errors detected for: {opts.file_name}") if skipped_lines > 0: print("Skipped lines: " + str(skipped_lines)) diff --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py index d3369abae70b9..0cfba16bfcf25 100644 --- a/llvm/utils/mlgo-utils/mlgo/__init__.py +++ b/llvm/utils/mlgo-utils/mlgo/__init__.py @@ -4,7 +4,7 @@ from datetime import timezone, datetime -__versioninfo__ = (20, 0, 0) +__versioninfo__ = (22, 0, 0) __version__ = ( ".".join(str(v) for v in __versioninfo__) + "dev" diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh index 66bef82586a34..0ac392cbed7be 100755 --- a/llvm/utils/release/export.sh +++ b/llvm/utils/release/export.sh @@ -123,7 +123,7 @@ export_sources() { tar -C test-suite-$release$rc.src --strip-components=1 -xzf - fi echo "Creating tarball for test-suite ..." 
- tar --sort=name --owner=0 --group=0 \ + XZ_OPT="-T0" tar --sort=name --owner=0 --group=0 \ --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \ -cJf test-suite-$release$rc.src.tar.xz test-suite-$release$rc.src fi diff --git a/llvm/utils/remote-exec.py b/llvm/utils/remote-exec.py index 8512b264f5d1c..4d646c5d3e282 100644 --- a/llvm/utils/remote-exec.py +++ b/llvm/utils/remote-exec.py @@ -165,4 +165,13 @@ def main(): if __name__ == "__main__": - exit(main()) + rc = main() + + # If the remote process died with a signal, this will be exposed by ssh as + # an exit code in this range. We may be running under `not --crash` which + # will expect us to also die with a signal, so send the signal to ourselves + # so that wait4() in `not` will detect the signal. + if rc > 128 and rc < 160: + os.kill(os.getpid(), rc - 128) + + exit(rc) diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md index b3bde055f04f0..f988bebea1223 100644 --- a/mlir/docs/DefiningDialects/Operations.md +++ b/mlir/docs/DefiningDialects/Operations.md @@ -89,7 +89,7 @@ their semantics via a special [TableGen backend][TableGenBackend]: help of the following constructs. * The `Dialect` class: Operations belonging to one logical group are placed in the same dialect. The `Dialect` class contains dialect-level information. -* The `OpTrait` class hierarchy: They are used to specify special properties +* The `Trait` class hierarchy: They are used to specify special properties and constraints of the operation, including whether the operation has side effect or whether its output has the same shape as the input. * The `ins`/`outs` marker: These are two special markers builtin to the @@ -306,6 +306,8 @@ Right now, the following primitive constraints are supported: * `IntPositive`: Specifying an integer attribute whose value is positive * `IntNonNegative`: Specifying an integer attribute whose value is non-negative +* `IntPowerOf2`: Specifying an integer attribute whose value is a power of + two > 0 * `ArrayMinCount`: Specifying an array attribute to have at least `N` elements * `ArrayMaxCount`: Specifying an array attribute to have at most `N` @@ -434,7 +436,7 @@ various traits in the `mlir::OpTrait` namespace. Both operation traits, [interfaces](../Interfaces.md/#utilizing-the-ods-framework), and constraints involving multiple operands/attributes/results are provided as the third template parameter to the `Op` class. They should be deriving from -the `OpTrait` class. See [Constraints](#constraints) for more information. +the `Trait` class. See [Constraints](#constraints) for more information. ### Builder methods @@ -1353,7 +1355,7 @@ results. These constraints should be specified as the `Op` class template parameter as described in [Operation traits and constraints](#operation-traits-and-constraints). -Multi-entity constraints are modeled as `PredOpTrait` (a subclass of `OpTrait`) +Multi-entity constraints are modeled as `PredOpTrait` (a subclass of `Trait`) in [`OpBase.td`][OpBase].A bunch of constraint primitives are provided to help specification. See [`OpBase.td`][OpBase] for the complete list. @@ -1364,7 +1366,7 @@ commutative or not, whether is a terminator, etc. These constraints should be specified as the `Op` class template parameter as described in [Operation traits and constraints](#operation-traits-and-constraints). -Traits are modeled as `NativeOpTrait` (a subclass of `OpTrait`) in +Traits are modeled as `NativeTrait` (a subclass of `Trait`) in [`OpBase.td`][OpBase]. 
They are backed and will be translated into the corresponding C++ `mlir::OpTrait` classes. diff --git a/mlir/include/mlir-c/Pass.h b/mlir/include/mlir-c/Pass.h index 8fd8e9956a65a..0d2e19ee7fb0a 100644 --- a/mlir/include/mlir-c/Pass.h +++ b/mlir/include/mlir-c/Pass.h @@ -88,6 +88,10 @@ MLIR_CAPI_EXPORTED void mlirPassManagerEnableIRPrinting( MLIR_CAPI_EXPORTED void mlirPassManagerEnableVerifier(MlirPassManager passManager, bool enable); +/// Enable pass timing. +MLIR_CAPI_EXPORTED void +mlirPassManagerEnableTiming(MlirPassManager passManager); + /// Nest an OpPassManager under the top-level PassManager, the nested /// passmanager will only run on operations matching the provided name. /// The returned OpPassManager will be destroyed when the parent is destroyed. diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h index b68262f09f485..ee401cca8f552 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h @@ -707,6 +707,19 @@ class IntegerRelation { /// this for uniformity with `applyDomain`. void applyRange(const IntegerRelation &rel); + /// Let the relation `this` be R1, and the relation `rel` be R2. Requires + /// R1 and R2 to have the same domain. + /// + /// Let R3 be the rangeProduct of R1 and R2. Then x R3 (y, z) iff + /// (x R1 y and x R2 z). + /// + /// Example: + /// + /// R1: (i, j) -> k : f(i, j, k) = 0 + /// R2: (i, j) -> l : g(i, j, l) = 0 + /// R1.rangeProduct(R2): (i, j) -> (k, l) : f(i, j, k) = 0 and g(i, j, l) = 0 + IntegerRelation rangeProduct(const IntegerRelation &rel); + /// Given a relation `other: (A -> B)`, this operation merges the symbol and /// local variables and then takes the composition of `other` on `this: (B -> /// C)`. The resulting relation represents tuples of the form: `A -> C`. diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h index d082d2d9f758b..18349d071bb2e 100644 --- a/mlir/include/mlir/Analysis/SliceAnalysis.h +++ b/mlir/include/mlir/Analysis/SliceAnalysis.h @@ -65,8 +65,9 @@ using ForwardSliceOptions = SliceOptions; /// /// The implementation traverses the use chains in postorder traversal for /// efficiency reasons: if an operation is already in `forwardSlice`, no -/// need to traverse its uses again. Since use-def chains form a DAG, this -/// terminates. +/// need to traverse its uses again. In the presence of use-def cycles in a +/// graph region, the traversal stops at the first operation that was already +/// visited (which is not added to the slice anymore). /// /// Upon return to the root call, `forwardSlice` is filled with a /// postorder list of uses (i.e. a reverse topological order). To get a proper @@ -114,8 +115,9 @@ void getForwardSlice(Value root, SetVector *forwardSlice, /// /// The implementation traverses the def chains in postorder traversal for /// efficiency reasons: if an operation is already in `backwardSlice`, no -/// need to traverse its definitions again. Since useuse-def chains form a DAG, -/// this terminates. +/// need to traverse its definitions again. In the presence of use-def cycles +/// in a graph region, the traversal stops at the first operation that was +/// already visited (which is not added to the slice anymore). /// /// Upon return to the root call, `backwardSlice` is filled with a /// postorder list of defs. 
This happens to be a topological order, from the diff --git a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h index 2dd35c097c796..1428d5ccf00f4 100644 --- a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h @@ -20,6 +20,7 @@ #define MLIR_BINDINGS_PYTHON_NANOBINDADAPTORS_H #include +#include #include "mlir-c/Diagnostics.h" #include "mlir-c/IR.h" @@ -43,18 +44,14 @@ namespace detail { /// with a raw handle (unowned). The returned object's lifetime may not extend /// beyond the apiObject handle without explicitly having its refcount increased /// (i.e. on return). -static nanobind::object mlirApiObjectToCapsule(nanobind::handle apiObject) { +static std::optional +mlirApiObjectToCapsule(nanobind::handle apiObject) { if (PyCapsule_CheckExact(apiObject.ptr())) return nanobind::borrow(apiObject); nanobind::object api = nanobind::getattr(apiObject, MLIR_PYTHON_CAPI_PTR_ATTR, nanobind::none()); - if (api.is_none()) { - std::string repr = nanobind::cast(nanobind::repr(apiObject)); - throw nanobind::type_error( - (llvm::Twine("Expected an MLIR object (got ") + repr + ").") - .str() - .c_str()); - } + if (api.is_none()) + return {}; return api; } @@ -67,13 +64,12 @@ static nanobind::object mlirApiObjectToCapsule(nanobind::handle apiObject) { template <> struct type_caster { NB_TYPE_CASTER(MlirAffineMap, const_name("MlirAffineMap")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToAffineMap(capsule.ptr()); - if (mlirAffineMapIsNull(value)) { - return false; + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToAffineMap(capsule->ptr()); + return !mlirAffineMapIsNull(value); } - return !mlirAffineMapIsNull(value); + return false; } static handle from_cpp(MlirAffineMap v, rv_policy, cleanup_list *cleanup) noexcept { @@ -90,10 +86,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirAttribute, const_name("MlirAttribute")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToAttribute(capsule.ptr()); - return !mlirAttributeIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToAttribute(capsule->ptr()); + return !mlirAttributeIsNull(value); + } + return false; } static handle from_cpp(MlirAttribute v, rv_policy, cleanup_list *cleanup) noexcept { @@ -111,10 +109,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirBlock, const_name("MlirBlock")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToBlock(capsule.ptr()); - return !mlirBlockIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToBlock(capsule->ptr()); + return !mlirBlockIsNull(value); + } + return false; } }; @@ -122,7 +122,7 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirContext, const_name("MlirContext")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { + bool 
from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { if (src.is_none()) { // Gets the current thread-bound context. // TODO: This raises an error of "No current context" currently. @@ -132,8 +132,8 @@ struct type_caster { .attr("Context") .attr("current"); } - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToContext(capsule.ptr()); + std::optional capsule = mlirApiObjectToCapsule(src); + value = mlirPythonCapsuleToContext(capsule->ptr()); return !mlirContextIsNull(value); } }; @@ -142,10 +142,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirDialectRegistry, const_name("MlirDialectRegistry")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToDialectRegistry(capsule.ptr()); - return !mlirDialectRegistryIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToDialectRegistry(capsule->ptr()); + return !mlirDialectRegistryIsNull(value); + } + return false; } static handle from_cpp(MlirDialectRegistry v, rv_policy, cleanup_list *cleanup) noexcept { @@ -162,16 +164,18 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirLocation, const_name("MlirLocation")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { if (src.is_none()) { // Gets the current thread-bound context. src = nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) .attr("Location") .attr("current"); } - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToLocation(capsule.ptr()); - return !mlirLocationIsNull(value); + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToLocation(capsule->ptr()); + return !mlirLocationIsNull(value); + } + return false; } static handle from_cpp(MlirLocation v, rv_policy, cleanup_list *cleanup) noexcept { @@ -188,10 +192,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirModule, const_name("MlirModule")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToModule(capsule.ptr()); - return !mlirModuleIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToModule(capsule->ptr()); + return !mlirModuleIsNull(value); + } + return false; } static handle from_cpp(MlirModule v, rv_policy, cleanup_list *cleanup) noexcept { @@ -209,12 +215,15 @@ template <> struct type_caster { NB_TYPE_CASTER(MlirFrozenRewritePatternSet, const_name("MlirFrozenRewritePatternSet")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToFrozenRewritePatternSet(capsule.ptr()); - return value.ptr != nullptr; + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToFrozenRewritePatternSet(capsule->ptr()); + return value.ptr != nullptr; + } + return false; } - static handle from_cpp(MlirFrozenRewritePatternSet v, rv_policy, handle) { + static handle from_cpp(MlirFrozenRewritePatternSet v, 
rv_policy, + handle) noexcept { nanobind::object capsule = nanobind::steal( mlirPythonFrozenRewritePatternSetToCapsule(v)); return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("rewrite")) @@ -228,10 +237,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirOperation, const_name("MlirOperation")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToOperation(capsule.ptr()); - return !mlirOperationIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToOperation(capsule->ptr()); + return !mlirOperationIsNull(value); + } + return false; } static handle from_cpp(MlirOperation v, rv_policy, cleanup_list *cleanup) noexcept { @@ -250,10 +261,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirValue, const_name("MlirValue")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToValue(capsule.ptr()); - return !mlirValueIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToValue(capsule->ptr()); + return !mlirValueIsNull(value); + } + return false; } static handle from_cpp(MlirValue v, rv_policy, cleanup_list *cleanup) noexcept { @@ -273,10 +286,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirPassManager, const_name("MlirPassManager")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToPassManager(capsule.ptr()); - return !mlirPassManagerIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToPassManager(capsule->ptr()); + return !mlirPassManagerIsNull(value); + } + return false; } }; @@ -284,10 +299,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirTypeID, const_name("MlirTypeID")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToTypeID(capsule.ptr()); - return !mlirTypeIDIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToTypeID(capsule->ptr()); + return !mlirTypeIDIsNull(value); + } + return false; } static handle from_cpp(MlirTypeID v, rv_policy, cleanup_list *cleanup) noexcept { @@ -306,10 +323,12 @@ struct type_caster { template <> struct type_caster { NB_TYPE_CASTER(MlirType, const_name("MlirType")) - bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) { - nanobind::object capsule = mlirApiObjectToCapsule(src); - value = mlirPythonCapsuleToType(capsule.ptr()); - return !mlirTypeIsNull(value); + bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { + if (auto capsule = mlirApiObjectToCapsule(src)) { + value = mlirPythonCapsuleToType(capsule->ptr()); + return !mlirTypeIsNull(value); + } + return false; } static handle from_cpp(MlirType t, rv_policy, cleanup_list *cleanup) noexcept { @@ -462,9 +481,10 @@ class mlir_attribute_subclass : public 
pure_subclass { nanobind::object newCf = nanobind::cpp_function( [superCls, isaFunction, captureTypeName]( nanobind::object cls, nanobind::object otherAttribute) { - MlirAttribute rawAttribute = - nanobind::cast(otherAttribute); - if (!isaFunction(rawAttribute)) { + MlirAttribute rawAttribute; + if (!nanobind::try_cast(otherAttribute, + rawAttribute) || + !isaFunction(rawAttribute)) { auto origRepr = nanobind::cast(nanobind::repr(otherAttribute)); throw std::invalid_argument( @@ -543,8 +563,9 @@ class mlir_type_subclass : public pure_subclass { nanobind::object newCf = nanobind::cpp_function( [superCls, isaFunction, captureTypeName](nanobind::object cls, nanobind::object otherType) { - MlirType rawType = nanobind::cast(otherType); - if (!isaFunction(rawType)) { + MlirType rawType; + if (!nanobind::try_cast(otherType, rawType) || + !isaFunction(rawType)) { auto origRepr = nanobind::cast(nanobind::repr(otherType)); throw std::invalid_argument((llvm::Twine("Cannot cast type to ") + @@ -625,8 +646,9 @@ class mlir_value_subclass : public pure_subclass { nanobind::object newCf = nanobind::cpp_function( [superCls, isaFunction, captureValueName](nanobind::object cls, nanobind::object otherValue) { - MlirValue rawValue = nanobind::cast(otherValue); - if (!isaFunction(rawValue)) { + MlirValue rawValue; + if (!nanobind::try_cast(otherValue, rawValue) || + !isaFunction(rawValue)) { auto origRepr = nanobind::cast(nanobind::repr(otherValue)); throw std::invalid_argument((llvm::Twine("Cannot cast value to ") + diff --git a/mlir/include/mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h b/mlir/include/mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h new file mode 100644 index 0000000000000..daac2a99ed80f --- /dev/null +++ b/mlir/include/mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h @@ -0,0 +1,27 @@ +//===- ComplexToROCDLLibraryCalls.h - convert from Complex to ROCDL calls -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_COMPLEXTOROCDLLIBRARYCALLS_COMPLEXTOROCDLLIBRARYCALLS_H_ +#define MLIR_CONVERSION_COMPLEXTOROCDLLIBRARYCALLS_COMPLEXTOROCDLLIBRARYCALLS_H_ + +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +class RewritePatternSet; + +#define GEN_PASS_DECL_CONVERTCOMPLEXTOROCDLLIBRARYCALLS +#include "mlir/Conversion/Passes.h.inc" + +/// Populate the given list with patterns that convert from Complex to ROCDL +/// calls. 
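The recurring rewrite in the nanobind caster changes above — `mlirApiObjectToCapsule` returning `std::optional` instead of throwing, and each `from_python` becoming `noexcept` and reporting failure by returning `false` — reduces to the following stand-alone sketch. `Handle`, `MlirThing`, and the helper names here are invented stand-ins, not nanobind or MLIR C API types:

```cpp
#include <cstdio>
#include <optional>
#include <string>

// Stand-ins for a Python handle and an MLIR C API wrapper struct.
struct Handle { const char *capi = nullptr; };
struct MlirThing { const void *ptr = nullptr; };

// Return an empty optional instead of throwing, so callers that must be
// noexcept can turn a missing capsule into a plain conversion failure.
static std::optional<std::string> apiObjectToCapsule(Handle h) {
  if (!h.capi)
    return std::nullopt; // previously: throw a type error here
  return std::string(h.capi);
}

static bool fromPython(Handle src, MlirThing &value) noexcept {
  if (auto capsule = apiObjectToCapsule(src)) {
    value.ptr = capsule->data(); // stand-in for mlirPythonCapsuleToX(...)
    return value.ptr != nullptr;
  }
  return false; // failed conversion; the binding layer can try other overloads
}

int main() {
  MlirThing t;
  std::printf("%d\n", fromPython(Handle{"capsule"}, t)); // 1
  std::printf("%d\n", fromPython(Handle{}, t));          // 0
}
```

Reporting failure through the return value rather than an exception lets the binding machinery fall back to other candidate overloads instead of aborting the call.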
+void populateComplexToROCDLLibraryCallsConversionPatterns( + RewritePatternSet &patterns); +} // namespace mlir + +#endif // MLIR_CONVERSION_COMPLEXTOROCDLLIBRARYCALLS_COMPLEXTOROCDLLIBRARYCALLS_H_ diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index 8a5976e547169..d93fbefab74aa 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -23,6 +23,7 @@ #include "mlir/Conversion/BufferizationToMemRef/BufferizationToMemRef.h" #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h" #include "mlir/Conversion/ComplexToLibm/ComplexToLibm.h" +#include "mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h" #include "mlir/Conversion/ComplexToSPIRV/ComplexToSPIRVPass.h" #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 50c67da91a4af..76e751243a12c 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -312,6 +312,18 @@ def ConvertComplexToLibm : Pass<"convert-complex-to-libm", "ModuleOp"> { let dependentDialects = ["func::FuncDialect"]; } +//===----------------------------------------------------------------------===// +// ComplexToROCDLLibraryCalls +//===----------------------------------------------------------------------===// + +def ConvertComplexToROCDLLibraryCalls : Pass<"convert-complex-to-rocdl-library-calls", "ModuleOp"> { + let summary = "Convert Complex dialect to ROCDL library calls"; + let description = [{ + This pass converts supported Complex ops to calls to the AMD device library. + }]; + let dependentDialects = ["func::FuncDialect"]; +} + //===----------------------------------------------------------------------===// // ComplexToSPIRV //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index eadb5d9326798..80959ffbaf426 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPackedOp let summary = "Extend a vector of packed floating point values"; let description = [{ - Extend and scale two packed floats in `source[index]` to two floats and + Extend and scale two packed floats in `source[index]` to two floats and return them. This rather unusual signature arises from the fact that AMD GPUs cannot @@ -861,7 +861,7 @@ def AMDGPU_WMMAOp : } def AMDGPU_GatherToLDSOp : - AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>, + AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>, Arguments<(ins Arg:$src, Variadic:$srcIndices, @@ -966,13 +966,13 @@ def AMDGPU_ScaledMFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences: - - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and - fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile - size. - - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` + - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and + fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile + size. + - `amdgpu.scaled_mfma` does not support broadcasting. 
So, `cbsz`, `abid`, and `blgp` are omitted from this wrapper. - - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for - double-precision operations on gfx94x and so are not included here. + - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for + double-precision operations on gfx94x and so are not included here. }]; let assemblyFormat = [{ `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h index 4134aef8174bc..3e4b8648061ff 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h @@ -81,13 +81,13 @@ LogicalResult getIndexSet(MutableArrayRef ops, /// Encapsulates a memref load or store access information. struct MemRefAccess { Value memref; - Operation *opInst; + Operation *opInst = nullptr; SmallVector indices; - /// Constructs a MemRefAccess from a load or store operation. - // TODO: add accessors to standard op's load, store, DMA op's to return - // MemRefAccess, i.e., loadOp->getAccess(), dmaOp->getRead/WriteAccess. - explicit MemRefAccess(Operation *opInst); + /// Constructs a MemRefAccess from an affine read/write operation. + explicit MemRefAccess(Operation *memOp); + + MemRefAccess() = default; // Returns the rank of the memref associated with this access. unsigned getRank() const; @@ -126,10 +126,12 @@ struct MemRefAccess { /// time (considering the memrefs, their respective affine access maps and /// operands). The equality of access functions + operands is checked by /// subtracting fully composed value maps, and then simplifying the difference - /// using the expression flattener. - /// TODO: this does not account for aliasing of memrefs. + /// using the expression flattener. This does not account for aliasing of + /// memrefs. bool operator==(const MemRefAccess &rhs) const; bool operator!=(const MemRefAccess &rhs) const { return !(*this == rhs); } + + explicit operator bool() const { return !!memref; } }; // DependenceComponent contains state about the direction of a dependence as an diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td index f175b15c8770f..271b42025e0af 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td @@ -401,7 +401,7 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [ SameOperandsAndResultElementType, Bufferization_TensorAndBufferMatch<"result", "buffer"> ]> { - let summary = "create a buffer-like type from a tensor-like type"; + let summary = "create a tensor-like type from a buffer-like type"; let description = [{ An operation that creates a tensor from a buffer. The result value is a tensor-like type that must match the corresponding buffer-like operand as diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h index 63e007cdc335c..e355bb8f5ddae 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h @@ -223,6 +223,9 @@ Value createGlobalString(Location loc, OpBuilder &builder, StringRef name, /// function confirms that the Operation has the desired properties. 
bool satisfiesLLVMModule(Operation *op); +/// Lookup parent Module satisfying LLVM conditions on the Module Operation. +Operation *parentLLVMModule(Operation *op); + /// Convert an array of integer attributes to a vector of integers that can be /// used as indices in LLVM operations. template diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index f4c1640098320..4a9bc90e43d96 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1285,6 +1285,10 @@ def LLVM_AddressOfOp : LLVM_Op<"mlir.addressof", /// Return the llvm.mlir.alias operation that defined the value referenced /// here. AliasOp getAlias(SymbolTableCollection &symbolTable); + + /// Return the llvm.mlir.ifunc operation that defined the value referenced + /// here. + IFuncOp getIFunc(SymbolTableCollection &symbolTable); }]; let assemblyFormat = "$global_name attr-dict `:` qualified(type($res))"; @@ -1601,6 +1605,67 @@ def LLVM_AliasOp : LLVM_Op<"mlir.alias", let hasRegionVerifier = 1; } +def LLVM_IFuncOp : LLVM_Op<"mlir.ifunc", + [IsolatedFromAbove, Symbol, DeclareOpInterfaceMethods]> { + let arguments = (ins + SymbolNameAttr:$sym_name, + TypeAttr:$i_func_type, + FlatSymbolRefAttr:$resolver, + TypeAttr:$resolver_type, + Linkage:$linkage, + UnitAttr:$dso_local, + DefaultValuedAttr, "0">:$address_space, + DefaultValuedAttr:$unnamed_addr, + DefaultValuedAttr:$visibility_ + ); + let summary = "LLVM dialect ifunc"; + let description = [{ + `llvm.mlir.ifunc` is a top-level operation that defines a global ifunc. + It defines a new symbol and takes a symbol referring to a resolver function. + IFuncs can be called as regular functions. The function type is the same + as the IFuncType. The symbol is resolved at runtime by calling a resolver + function. + + Examples: + + ```mlir + // IFuncs resolve a symbol at runtime using a resolver function. + llvm.mlir.ifunc external @foo: !llvm.func, !llvm.ptr @resolver + + llvm.func @foo_1(i64) -> f32 + llvm.func @foo_2(i64) -> f32 + + llvm.func @resolve_foo() -> !llvm.ptr attributes { + %0 = llvm.mlir.addressof @foo_2 : !llvm.ptr + %1 = llvm.mlir.addressof @foo_1 : !llvm.ptr + + // ... Logic selecting from foo_{1, 2} + + // Return function pointer to the selected function + llvm.return %7 : !llvm.ptr + } + + llvm.func @use_foo() { + // IFuncs are called as regular functions + %res = llvm.call @foo(%value) : i64 -> f32 + } + ``` + }]; + + let builders = [ + OpBuilder<(ins "StringRef":$name, "Type":$i_func_type, + "StringRef":$resolver, "Type":$resolver_type, + "Linkage":$linkage, "LLVM::Visibility":$visibility)> + ]; + + let assemblyFormat = [{ + custom($linkage) ($visibility_^)? ($unnamed_addr^)?
+ $sym_name `:` $i_func_type `,` $resolver_type $resolver attr-dict + }]; + let hasVerifier = 1; +} + + def LLVM_DSOLocalEquivalentOp : LLVM_Op<"dso_local_equivalent", [Pure, ConstantLike, DeclareOpInterfaceMethods]> { let arguments = (ins FlatSymbolRefAttr:$function_name); diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index b4dde776822a1..bafeca924e4c5 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -2431,12 +2431,11 @@ def VectorizeOp : Op:$vector_sizes, - DefaultValuedOptionalAttr: - $static_vector_sizes, - OptionalAttr:$vectorize_nd_extract, - DefaultValuedOptionalAttr: - $scalable_sizes); + Variadic:$vector_sizes, + DefaultValuedOptionalAttr:$static_vector_sizes, + OptionalAttr:$vectorize_nd_extract, + OptionalAttr:$assume_dynamic_dims_match_vec_sizes, + DefaultValuedOptionalAttr:$scalable_sizes); let results = (outs); diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 74280fdd82f4e..9e62d0dcc7890 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -880,7 +880,8 @@ FailureOr vectorize(RewriterBase &rewriter, Operation *op, ArrayRef inputVectorSizes = {}, ArrayRef inputScalableVecDims = {}, - bool vectorizeNDExtract = false, bool flatten1DDepthwiseConv = false); + bool vectorizeNDExtract = false, bool flatten1DDepthwiseConv = false, + bool assumeDynamicDimsMatchVecSizes = false); /// Emit a suitable vector form for a Copy op with fully static shape. LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp); diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index 09bb3932ef293..9321089ab55fa 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -1216,6 +1216,11 @@ def LoadOp : MemRef_Op<"load", be reused in the cache. For details, refer to the [https://llvm.org/docs/LangRef.html#load-instruction](LLVM load instruction). + An optional `alignment` attribute allows to specify the byte alignment of the + load operation. It must be a positive power of 2. The operation must access + memory at an address aligned to this boundary. Violations may lead to + architecture-specific faults or performance penalties. + A value of 0 indicates no specific alignment requirement. Example: ```mlir @@ -1226,7 +1231,39 @@ def LoadOp : MemRef_Op<"load", let arguments = (ins Arg:$memref, Variadic:$indices, - DefaultValuedOptionalAttr:$nontemporal); + DefaultValuedOptionalAttr:$nontemporal, + ConfinedAttr, + [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); + + let builders = [ + OpBuilder<(ins "Value":$memref, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, memref, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]>, + OpBuilder<(ins "Type":$resultType, + "Value":$memref, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, resultType, memref, indices, nontemporal, + alignment != 0 ? 
$_builder.getI64IntegerAttr(alignment) : + nullptr); + }]>, + OpBuilder<(ins "TypeRange":$resultTypes, + "Value":$memref, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, resultTypes, memref, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]> + ]; + let results = (outs AnyType:$result); let extraClassDeclaration = [{ @@ -1912,6 +1949,11 @@ def MemRef_StoreOp : MemRef_Op<"store", be reused in the cache. For details, refer to the [https://llvm.org/docs/LangRef.html#store-instruction](LLVM store instruction). + An optional `alignment` attribute allows to specify the byte alignment of the + store operation. It must be a positive power of 2. The operation must access + memory at an address aligned to this boundary. Violations may lead to + architecture-specific faults or performance penalties. + A value of 0 indicates no specific alignment requirement. Example: ```mlir @@ -1923,13 +1965,25 @@ def MemRef_StoreOp : MemRef_Op<"store", Arg:$memref, Variadic:$indices, - DefaultValuedOptionalAttr:$nontemporal); + DefaultValuedOptionalAttr:$nontemporal, + ConfinedAttr, + [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); let builders = [ + OpBuilder<(ins "Value":$valueToStore, + "Value":$memref, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, valueToStore, memref, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]>, OpBuilder<(ins "Value":$valueToStore, "Value":$memref), [{ $_state.addOperands(valueToStore); $_state.addOperands(memref); - }]>]; + }]> + ]; let extraClassDeclaration = [{ Value getValueToStore() { return getOperand(0); } diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h index 3f1041cb25103..243dbf081b999 100644 --- a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h +++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h @@ -62,9 +62,11 @@ void populateAllReduceEndomorphismSimplificationPatterns( auto isEndomorphismOp = [reduction](Operation *op, std::optional referenceOp) { auto allReduceOp = llvm::dyn_cast(op); + if (!allReduceOp) + return false; auto inType = cast(allReduceOp.getInput().getType()); auto outType = cast(allReduceOp.getResult().getType()); - if (!allReduceOp || inType.getElementType() != outType.getElementType() || + if (inType.getElementType() != outType.getElementType() || allReduceOp.getReduction() != reduction) { return false; } @@ -87,9 +89,7 @@ void populateAllReduceEndomorphismSimplificationPatterns( return refAllReduceOp->getAttrs() == allReduceOp->getAttrs() && inType.getElementType() == refType.getElementType(); }; - auto isAlgebraicOp = [](Operation *op) { - return static_cast(llvm::dyn_cast(op)); - }; + auto isAlgebraicOp = [](Operation *op) { return isa(op); }; using ConcreteEndomorphismSimplification = EndomorphismSimplification< std::decay_t, diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h index 4eb666239d4e4..8f87235fcd237 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -29,6 +29,7 @@ #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include #define 
GET_TYPEDEF_CLASSES #include "mlir/Dialect/OpenACC/OpenACCOpsTypes.h.inc" diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 66378f116784e..96b9adcc53b3c 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2772,8 +2772,10 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> { }]; let arguments = (ins SymbolNameAttr:$sym_name, SymbolRefAttr:$func_name, - OptionalAttr:$bindName, - OptionalAttr:$bindNameDeviceType, + OptionalAttr:$bindIdName, + OptionalAttr:$bindStrName, + OptionalAttr:$bindIdNameDeviceType, + OptionalAttr:$bindStrNameDeviceType, OptionalAttr:$worker, OptionalAttr:$vector, OptionalAttr:$seq, UnitAttr:$nohost, @@ -2815,14 +2817,14 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> { std::optional getGangDimValue(); std::optional getGangDimValue(mlir::acc::DeviceType deviceType); - std::optional getBindNameValue(); - std::optional getBindNameValue(mlir::acc::DeviceType deviceType); + std::optional<::std::variant> getBindNameValue(); + std::optional<::std::variant> getBindNameValue(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ $sym_name `func` `(` $func_name `)` oilist ( - `bind` `(` custom($bindName, $bindNameDeviceType) `)` + `bind` `(` custom($bindIdName, $bindStrName ,$bindIdNameDeviceType, $bindStrNameDeviceType) `)` | `gang` `` custom($gang, $gangDim, $gangDimDeviceType) | `worker` custom($worker) | `vector` custom($vector) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td index 16c14ef085d6d..311c57fb4446c 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td @@ -22,6 +22,31 @@ include "mlir/Dialect/OpenMP/OpenMPOpBase.td" include "mlir/IR/SymbolInterfaces.td" +//===----------------------------------------------------------------------===// +// V5.2: [6.3] `align` clause +//===----------------------------------------------------------------------===// + +class OpenMP_AlignClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause { + let arguments = (ins + ConfinedAttr, [IntPositive]>:$align + ); + + let optAssemblyFormat = [{ + `align` `(` $align `)` + }]; + + let description = [{ + The `align` clause is used to specify the byte alignment to use for + allocations associated with the construct on which the clause appears. 
+ }]; +} + +def OpenMP_AlignClause : OpenMP_AlignClauseSkip<>; + //===----------------------------------------------------------------------===// // V5.2: [5.11] `aligned` clause //===----------------------------------------------------------------------===// @@ -84,6 +109,32 @@ class OpenMP_AllocateClauseSkip< def OpenMP_AllocateClause : OpenMP_AllocateClauseSkip<>; +//===----------------------------------------------------------------------===// +// V5.2: [6.4] `allocator` clause +//===----------------------------------------------------------------------===// + +class OpenMP_AllocatorClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause { + + let arguments = (ins + OptionalAttr:$allocator + ); + + let optAssemblyFormat = [{ + `allocator` `(` custom($allocator) `)` + }]; + + let description = [{ + `allocator` specifies the memory allocator to be used for allocations + associated with the construct on which the clause appears. + }]; +} + +def OpenMP_AllocatorClause : OpenMP_AllocatorClauseSkip<>; + //===----------------------------------------------------------------------===// // LLVM OpenMP extension `ompx_bare` clause //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index 9dbe6897a3304..c080c3fac87d4 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -263,4 +263,34 @@ def VariableCaptureKindAttr : OpenMP_EnumAttr; +def OpenMP_AllocatorHandleDefaultMemAlloc : I32EnumAttrCase<"omp_default_mem_alloc", 1>; +def OpenMP_AllocatorHandleLargeCapMemAlloc : I32EnumAttrCase<"omp_large_cap_mem_alloc", 2>; +def OpenMP_AllocatorHandleConstMemAlloc : I32EnumAttrCase<"omp_const_mem_alloc", 3>; +def OpenMP_AllocatorHandleHighBwMemAlloc : I32EnumAttrCase<"omp_high_bw_mem_alloc", 4>; +def OpenMP_AllocatorHandleLowLatMemAlloc : I32EnumAttrCase<"omp_low_lat_mem_alloc", 5>; +def OpenMP_AllocatorHandleCgroupMemAlloc : I32EnumAttrCase<"omp_cgroup_mem_alloc", 6>; +def OpenMP_AllocatorHandlePteamMemAlloc : I32EnumAttrCase<"omp_pteam_mem_alloc", 7>; +def OpenMP_AllocatorHandlethreadMemAlloc : I32EnumAttrCase<"omp_thread_mem_alloc", 8>; + +def AllocatorHandle : OpenMP_I32EnumAttr< + "AllocatorHandle", + "OpenMP allocator_handle", [ + OpenMP_AllocatorHandleNullAllocator, + OpenMP_AllocatorHandleDefaultMemAlloc, + OpenMP_AllocatorHandleLargeCapMemAlloc, + OpenMP_AllocatorHandleConstMemAlloc, + OpenMP_AllocatorHandleHighBwMemAlloc, + OpenMP_AllocatorHandleLowLatMemAlloc, + OpenMP_AllocatorHandleCgroupMemAlloc, + OpenMP_AllocatorHandlePteamMemAlloc, + OpenMP_AllocatorHandlethreadMemAlloc + ]>; + +def AllocatorHandleAttr : OpenMP_EnumAttr; #endif // OPENMP_ENUMS diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index e4f52777d8aa2..8cf18b43450ab 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2090,4 +2090,27 @@ def MaskedOp : OpenMP_Op<"masked", clauses = [ ]; } +//===----------------------------------------------------------------------===// +// [Spec 5.2] 6.5 allocate Directive +//===----------------------------------------------------------------------===// +def AllocateDirOp : OpenMP_Op<"allocate_dir", clauses = [ + OpenMP_AlignClause, OpenMP_AllocatorClause + ]> { + let 
summary = "allocate directive"; + let description = [{ + The storage for each list item that appears in the allocate directive is + provided an allocation through the memory allocator. + }] # clausesDescription; + + let arguments = !con((ins Variadic:$varList), + clausesArgs); + + // Override inherited assembly format to include `varList`. + let assemblyFormat = " `(` $varList `:` type($varList) `)` oilist(" # + clausesOptAssemblyFormat # + ") attr-dict "; + + let hasVerifier = 1; +} + #endif // OPENMP_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index 1f614d63cb54f..90383265002a3 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -359,6 +359,7 @@ def SPV_EXT_shader_atomic_float_min_max : I32EnumAttrCase<"SPV_EXT_shader_atomi def SPV_EXT_shader_image_int64 : I32EnumAttrCase<"SPV_EXT_shader_image_int64", 1010>; def SPV_EXT_shader_atomic_float16_add : I32EnumAttrCase<"SPV_EXT_shader_atomic_float16_add", 1011>; def SPV_EXT_mesh_shader : I32EnumAttrCase<"SPV_EXT_mesh_shader", 1012>; +def SPV_EXT_replicated_composites : I32EnumAttrCase<"SPV_EXT_replicated_composites", 1013>; def SPV_AMD_gpu_shader_half_float_fetch : I32EnumAttrCase<"SPV_AMD_gpu_shader_half_float_fetch", 2000>; def SPV_AMD_shader_ballot : I32EnumAttrCase<"SPV_AMD_shader_ballot", 2001>; @@ -446,7 +447,7 @@ def SPIRV_ExtensionAttr : SPV_EXT_shader_stencil_export, SPV_EXT_shader_viewport_index_layer, SPV_EXT_shader_atomic_float_add, SPV_EXT_shader_atomic_float_min_max, SPV_EXT_shader_image_int64, SPV_EXT_shader_atomic_float16_add, - SPV_EXT_mesh_shader, + SPV_EXT_mesh_shader, SPV_EXT_replicated_composites, SPV_ARM_tensors, SPV_AMD_gpu_shader_half_float_fetch, SPV_AMD_shader_ballot, SPV_AMD_shader_explicit_vertex_parameter, SPV_AMD_shader_fragment_mask, @@ -849,6 +850,12 @@ def SPIRV_C_CooperativeMatrixKHR : I32EnumAttrCase<"Coope MinVersion ]; } +def SPIRV_C_ReplicatedCompositesEXT : I32EnumAttrCase<"ReplicatedCompositesEXT", 6024> { + list availability = [ + Extension<[SPV_EXT_replicated_composites]>, + MinVersion + ]; +} def SPIRV_C_BitInstructions : I32EnumAttrCase<"BitInstructions", 6025> { list availability = [ Extension<[SPV_KHR_bit_instructions]> @@ -1500,7 +1507,7 @@ def SPIRV_CapabilityAttr : SPIRV_C_USMStorageClassesINTEL, SPIRV_C_IOPipesINTEL, SPIRV_C_BlockingPipesINTEL, SPIRV_C_FPGARegINTEL, SPIRV_C_DotProductInputAll, SPIRV_C_DotProductInput4x8BitPacked, SPIRV_C_DotProduct, SPIRV_C_RayCullMaskKHR, - SPIRV_C_CooperativeMatrixKHR, + SPIRV_C_CooperativeMatrixKHR, SPIRV_C_ReplicatedCompositesEXT, SPIRV_C_BitInstructions, SPIRV_C_AtomicFloat32AddEXT, SPIRV_C_AtomicFloat64AddEXT, SPIRV_C_LongConstantCompositeINTEL, SPIRV_C_OptNoneINTEL, SPIRV_C_AtomicFloat16AddEXT, SPIRV_C_DebugInfoModuleINTEL, SPIRV_C_SplitBarrierINTEL, @@ -4565,6 +4572,8 @@ def SPIRV_OC_OpCooperativeMatrixLoadKHR : I32EnumAttrCase<"OpCooperativeMa def SPIRV_OC_OpCooperativeMatrixStoreKHR : I32EnumAttrCase<"OpCooperativeMatrixStoreKHR", 4458>; def SPIRV_OC_OpCooperativeMatrixMulAddKHR : I32EnumAttrCase<"OpCooperativeMatrixMulAddKHR", 4459>; def SPIRV_OC_OpCooperativeMatrixLengthKHR : I32EnumAttrCase<"OpCooperativeMatrixLengthKHR", 4460>; +def SPIRV_OC_OpConstantCompositeReplicateEXT : I32EnumAttrCase<"OpConstantCompositeReplicateEXT", 4461>; +def SPIRV_OC_OpSpecConstantCompositeReplicateEXT : I32EnumAttrCase<"OpSpecConstantCompositeReplicateEXT", 4462>; def SPIRV_OC_OpEmitMeshTasksEXT : 
I32EnumAttrCase<"OpEmitMeshTasksEXT", 5294>; def SPIRV_OC_OpSetMeshOutputsEXT : I32EnumAttrCase<"OpSetMeshOutputsEXT", 5295>; def SPIRV_OC_OpSubgroupBlockReadINTEL : I32EnumAttrCase<"OpSubgroupBlockReadINTEL", 5575>; @@ -4674,6 +4683,8 @@ def SPIRV_OpcodeAttr : SPIRV_OC_OpSUDotAccSat, SPIRV_OC_OpTypeCooperativeMatrixKHR, SPIRV_OC_OpCooperativeMatrixLoadKHR, SPIRV_OC_OpCooperativeMatrixStoreKHR, SPIRV_OC_OpCooperativeMatrixMulAddKHR, SPIRV_OC_OpCooperativeMatrixLengthKHR, + SPIRV_OC_OpConstantCompositeReplicateEXT, + SPIRV_OC_OpSpecConstantCompositeReplicateEXT, SPIRV_OC_OpEmitMeshTasksEXT, SPIRV_OC_OpSetMeshOutputsEXT, SPIRV_OC_OpSubgroupBlockReadINTEL, SPIRV_OC_OpSubgroupBlockWriteINTEL, SPIRV_OC_OpAssumeTrueKHR, SPIRV_OC_OpAtomicFAddEXT, diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td index c5a85f881b35e..7986025d6ca31 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td @@ -135,6 +135,47 @@ def SPIRV_ConstantOp : SPIRV_Op<"Constant", let autogenSerialization = 0; } + +// ----- + +def SPIRV_EXTConstantCompositeReplicateOp : SPIRV_ExtVendorOp<"ConstantCompositeReplicate", [Pure]> { + let summary = [{ + Declare a new replicated composite constant op. + }]; + + let description = [{ + Represents a splat composite constant i.e., all elements of composite constant + have the same value. + + #### Example: + + ```mlir + %0 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : vector<2xi32> + %1 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.array<2 x vector<2xi32>> + %2 = spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>> + ``` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_EXT_replicated_composites]>, + Capability<[SPIRV_C_ReplicatedCompositesEXT]> + ]; + + let arguments = (ins + AnyAttr:$value + ); + + let results = (outs + SPIRV_Composite:$replicated_constant + ); + + let autogenSerialization = 0; + + let assemblyFormat = "` ` `[` $value `]` `:` type($replicated_constant) attr-dict"; +} + // ----- def SPIRV_EntryPointOp : SPIRV_Op<"EntryPoint", [InModuleScope]> { @@ -689,6 +730,43 @@ def SPIRV_SpecConstantCompositeOp : SPIRV_Op<"SpecConstantComposite", [ // ----- +def SPIRV_EXTSpecConstantCompositeReplicateOp : SPIRV_ExtVendorOp<"SpecConstantCompositeReplicate", [InModuleScope, Symbol]> { + let summary = "Declare a new replicated composite specialization constant op."; + + let description = [{ + Represents a splat spec composite constant i.e., all elements of spec composite + constant have the same value. The splat value must come from a symbol reference + of spec constant instruction. 
+ + #### Example: + + ```mlir + spirv.SpecConstant @sc_i32_1 = 1 : i32 + spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_i32 (@sc_i32_1) : !spirv.array<3 x i32> + spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_i32 (@sc_i32_1) : !spirv.struct<(i32, i32, i32)> + ``` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_EXT_replicated_composites]>, + Capability<[SPIRV_C_ReplicatedCompositesEXT]> + ]; + + let arguments = (ins + TypeAttr:$type, + StrAttr:$sym_name, + SymbolRefAttr:$constituent + ); + + let results = (outs); + + let autogenSerialization = 0; +} + +// ----- + def SPIRV_SpecConstantOperationOp : SPIRV_Op<"SpecConstantOperation", [ Pure, InFunctionScope, SingleBlockImplicitTerminator<"YieldOp">]> { diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc index c77da91942ef9..1f718accabd15 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc @@ -433,8 +433,6 @@ extensionComplianceMap = { {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}}, {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}}, {{Extension::bf16}, {{bf16T, bf16T}}}}}, - {"tosa.cond_if", {{{Extension::controlflow}, {{boolT}}}}}, - {"tosa.while_loop", {{{Extension::controlflow}, {{boolT}}}}}, {"tosa.variable", {{{Extension::variable}, {{i8T}, {fp16T}, {fp32T}}}}}, {"tosa.variable_write", {{{Extension::variable}, {{i8T}, {fp16T}, {fp32T}}}}}, diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index cbe490f6e4dd1..e07188a1a04bf 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1809,12 +1809,42 @@ def Vector_LoadOp : Vector_Op<"load", [ ```mlir %result = vector.load %memref[%c0] : memref<7xf32>, vector<8xf32> ``` + + An optional `alignment` attribute allows specifying the byte alignment of the + load operation. It must be a positive power of 2. The operation must access + memory at an address aligned to this boundary. Violations may lead to + architecture-specific faults or performance penalties. + A value of 0 indicates no specific alignment requirement. }]; let arguments = (ins Arg:$base, Variadic:$indices, - DefaultValuedOptionalAttr:$nontemporal); + DefaultValuedOptionalAttr:$nontemporal, + ConfinedAttr, + [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); + + let builders = [ + OpBuilder<(ins "VectorType":$resultType, + "Value":$base, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, resultType, base, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]>, + OpBuilder<(ins "TypeRange":$resultTypes, + "Value":$base, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, resultTypes, base, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]> + ]; + let results = (outs AnyVectorOfAnyRank:$result); let extraClassDeclaration = [{ @@ -1895,6 +1925,12 @@ def Vector_StoreOp : Vector_Op<"store", [ ```mlir vector.store %valueToStore, %memref[%c0] : memref<7xf32>, vector<8xf32> ``` + + An optional `alignment` attribute allows specifying the byte alignment of the + store operation. It must be a positive power of 2. 
The operation must access + memory at an address aligned to this boundary. Violations may lead to + architecture-specific faults or performance penalties. + A value of 0 indicates no specific alignment requirement. }]; let arguments = (ins @@ -1902,8 +1938,21 @@ def Vector_StoreOp : Vector_Op<"store", [ Arg:$base, Variadic:$indices, - DefaultValuedOptionalAttr:$nontemporal - ); + DefaultValuedOptionalAttr:$nontemporal, + ConfinedAttr, + [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); + + let builders = [ + OpBuilder<(ins "Value":$valueToStore, + "Value":$base, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, valueToStore, base, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]> + ]; let extraClassDeclaration = [{ MemRefType getMemRefType() { diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index bd5ea9fd83781..81e25f7537cb0 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -110,23 +110,34 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface Variadic: $offsets, Variadic: $shape, Variadic: $strides, - DenseI64ArrayAttr: $const_offsets, + OptionalAttr: $const_offsets, OptionalAttr: $const_shape, OptionalAttr: $const_strides ); - let results = (outs XeGPU_TensorDesc: $TensorDesc); let assemblyFormat = [{ $source `` - custom($offsets, $const_offsets) - (`,` custom($shape, $const_shape)^ - `,` custom($strides, $const_strides))? + custom($offsets, $const_offsets) + (`,` `shape` `:` custom($shape, $const_shape)^ + `,` `strides``:` custom($strides, $const_strides))? attr-dict `:` type($source) `->` qualified(type($TensorDesc)) }]; + let results = (outs XeGPU_TensorDesc: $TensorDesc); + let hasVerifier = 1; let builders = [ + OpBuilder<(ins "Type": $tdesc, "TypedValue": $source)>, + + OpBuilder<(ins "Type": $tdesc, "TypedValue ": $source, + "llvm::ArrayRef": $shape, + "llvm::ArrayRef": $strides)>, + + OpBuilder<(ins "Type": $tdesc, "TypedValue ": $source, + "llvm::ArrayRef": $shape, + "llvm::ArrayRef": $strides)>, + OpBuilder<(ins "Type": $tdesc, "TypedValue": $source, "llvm::ArrayRef": $offsets)>, @@ -163,7 +174,17 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface } ArrayRef getStaticOffsets(){ - return getConstOffsets(); + auto attr = getConstOffsetsAttr(); + + if (attr) + return attr; + + int64_t rank = getMixedSizes().size(); + + setConstOffsets(llvm::SmallVector(rank, 0)); + + attr = getConstOffsetsAttr(); + return attr; } /// wrapper for matching with OffsetSizeAndStrideOpInterface @@ -172,10 +193,16 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface /// and `const_shape` will be used to represent the shape of /// source operand. They overide static shape from source memref type. 
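/// If no explicit `const_shape` is supplied, a memref source falls back to
/// its static shape, while a non-memref source yields an empty array so that
/// OffsetSizeAndStrideOpInterface checks still receive a valid result (see
/// getStaticSizes below).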
ArrayRef getStaticSizes() { + /// To be compatible with OffsetSizeAndStrideOpInterface, which expects a valid return value and performs checks + static llvm::SmallVector emptyShape; + auto attr = getConstShapeAttr(); - if (llvm::isa(getSourceType()) || attr) + if (attr) return attr; + if (llvm::isa(getSourceType())) + return emptyShape; + auto memrefType = llvm::dyn_cast(getSourceType()); assert(memrefType && "Incorrect use of getStaticSizes"); return memrefType.getShape(); @@ -187,9 +214,15 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface /// wrapper for matching with OffsetSizeAndStrideOpInterface /// and `const_strides` will be used to represent the strides of /// source operand. They overide static strides from source memref type. ArrayRef getStaticStrides() { + /// To be compatible with OffsetSizeAndStrideOpInterface, which expects a valid return value and performs checks + static llvm::SmallVector emptyStrides; + auto attr = getConstStridesAttr(); - if (llvm::isa(getSourceType()) || attr) + if (attr) return attr; + + if (llvm::isa(getSourceType())) + return emptyStrides; auto memrefType = llvm::dyn_cast(getSourceType()); assert(memrefType && "Incorrect use of getStaticStrides"); diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 6fea10185402a..488f358ff3802 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -76,6 +76,17 @@ LayoutAttr getLayoutAttr(const Value value); /// it will check the operand itself and its defining op. LayoutAttr getLayoutAttr(const OpOperand &opr); +/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. +template || + std::is_same_v>> +void removeLayoutAttr(const T &operandOrResult); + +/// Removes the LayoutAttr for each OpOperand and OpResult of the given +/// operation if they exist. If the operation contains regions, it is also +/// applied recursively to the contained operations. +void removeLayoutAttrs(Operation *op); + /// Sets the LayoutAttr for a given OpOperand or OpResult by attaching /// it to the owner's dictionary attributes template ($_self).getValue().isStrictlyPositive()">, "whose value is positive">; +def IntPowerOf2 : AttrConstraint< + CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getValue().isPowerOf2()">, + "whose value is a power of two > 0">; + class ArrayMaxCount : AttrConstraint< CPred<"::llvm::cast<::mlir::ArrayAttr>($_self).size() <= " # n>, "with at most " # n # " elements">; diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td index 43ef28624fb19..9e5fb5659a22b 100644 --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -22,7 +22,7 @@ include "mlir/IR/Utils.td" include "mlir/IR/AttrTypeBase.td" //===----------------------------------------------------------------------===// -// OpTrait definitions +// *OpTrait definitions //===----------------------------------------------------------------------===// // A trait that describes the structure of operation will be marked with diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index afeb784b85a12..3a2dbd136b438 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -475,6 +475,25 @@ class RewriterBase : public OpBuilder { RewriterBase::Listener *rewriteListener; }; + /// A listener that logs notification events to llvm::dbgs() before + /// forwarding to the base listener. 
+ struct PatternLoggingListener : public RewriterBase::ForwardingListener { + PatternLoggingListener(OpBuilder::Listener *listener, StringRef patternName) + : RewriterBase::ForwardingListener(listener), patternName(patternName) { + } + + void notifyOperationInserted(Operation *op, InsertPoint previous) override; + void notifyOperationModified(Operation *op) override; + void notifyOperationReplaced(Operation *op, Operation *newOp) override; + void notifyOperationReplaced(Operation *op, + ValueRange replacement) override; + void notifyOperationErased(Operation *op) override; + void notifyPatternBegin(const Pattern &pattern, Operation *op) override; + + private: + StringRef patternName; + }; + /// Move the blocks that belong to "region" before the given position in /// another region "parent". The two regions must be different. The caller /// is responsible for creating or updating the operation transferring flow diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index 9902c6bb15caf..c484072ffaa80 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -71,6 +71,9 @@ class ModuleImport { /// Converts all aliases of the LLVM module to MLIR variables. LogicalResult convertAliases(); + /// Converts all ifuncs of the LLVM module to MLIR variables. + LogicalResult convertIFuncs(); + /// Converts the data layout of the LLVM module to an MLIR data layout /// specification. LogicalResult convertDataLayout(); @@ -320,6 +323,8 @@ class ModuleImport { /// Converts an LLVM global alias variable into an MLIR LLVM dialect alias /// operation if a conversion exists. Otherwise, returns failure. LogicalResult convertAlias(llvm::GlobalAlias *alias); + // Converts an LLVM global ifunc into an MLIR LLVM dialect ifunc operation. + LogicalResult convertIFunc(llvm::GlobalIFunc *ifunc); /// Returns personality of `func` as a FlatSymbolRefAttr. FlatSymbolRefAttr getPersonalityAsAttr(llvm::Function *func); /// Imports `bb` into `block`, which must be initially empty. diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index 5d52cf3f04b6a..f3f73f49f199a 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -260,6 +260,12 @@ class ModuleTranslation { return aliasesMapping.lookup(op); } + /// Finds an LLVM IR global value that corresponds to the given MLIR operation + /// defining an IFunc. + llvm::GlobalValue *lookupIFunc(Operation *op) { + return ifuncMapping.lookup(op); + } + /// Returns the OpenMP IR builder associated with the LLVM IR module being /// constructed. llvm::OpenMPIRBuilder *getOpenMPBuilder(); @@ -345,6 +351,7 @@ class ModuleTranslation { bool recordInsertions = false); LogicalResult convertFunctionSignatures(); LogicalResult convertFunctions(); + LogicalResult convertIFuncs(); LogicalResult convertComdats(); LogicalResult convertUnresolvedBlockAddress(); @@ -406,6 +413,10 @@ class ModuleTranslation { /// aliases. DenseMap aliasesMapping; + /// Mappings between llvm.mlir.ifunc definitions and corresponding global + /// ifuncs. + DenseMap ifuncMapping; + /// A stateful object used to translate types. 
TypeToLLVMIRTranslator typeTranslator; diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index 17e48e0d069b7..5c4d4d13580a0 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -2481,6 +2481,44 @@ void IntegerRelation::applyDomain(const IntegerRelation &rel) { void IntegerRelation::applyRange(const IntegerRelation &rel) { compose(rel); } +IntegerRelation IntegerRelation::rangeProduct(const IntegerRelation &rel) { + /// R1: (i, j) -> k : f(i, j, k) = 0 + /// R2: (i, j) -> l : g(i, j, l) = 0 + /// R1.rangeProduct(R2): (i, j) -> (k, l) : f(i, j, k) = 0 and g(i, j, l) = 0 + assert(getNumDomainVars() == rel.getNumDomainVars() && + "Range product is only defined for relations with equal domains"); + + // explicit copy of `this` + IntegerRelation result = *this; + unsigned relRangeVarStart = rel.getVarKindOffset(VarKind::Range); + unsigned numThisRangeVars = getNumRangeVars(); + unsigned numNewSymbolVars = result.getNumSymbolVars() - getNumSymbolVars(); + + result.appendVar(VarKind::Range, rel.getNumRangeVars()); + + // Copy each equality from `rel` and update the copy to account for range + // variables from `this`. The `rel` equality is a list of coefficients of the + // variables from `rel`, and so the range variables need to be shifted right + // by the number of `this` range variables and symbols. + for (unsigned i = 0; i < rel.getNumEqualities(); ++i) { + SmallVector copy = + SmallVector(rel.getEquality(i)); + copy.insert(copy.begin() + relRangeVarStart, + numThisRangeVars + numNewSymbolVars, DynamicAPInt(0)); + result.addEquality(copy); + } + + for (unsigned i = 0; i < rel.getNumInequalities(); ++i) { + SmallVector copy = + SmallVector(rel.getInequality(i)); + copy.insert(copy.begin() + relRangeVarStart, + numThisRangeVars + numNewSymbolVars, DynamicAPInt(0)); + result.addInequality(copy); + } + + return result; +} + void IntegerRelation::printSpace(raw_ostream &os) const { space.print(os); os << getNumConstraints() << " constraints\n"; diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp index 36a9812bd7972..991c71e3f689a 100644 --- a/mlir/lib/Analysis/SliceAnalysis.cpp +++ b/mlir/lib/Analysis/SliceAnalysis.cpp @@ -26,7 +26,8 @@ using namespace mlir; static void -getForwardSliceImpl(Operation *op, SetVector *forwardSlice, +getForwardSliceImpl(Operation *op, DenseSet &visited, + SetVector *forwardSlice, const SliceOptions::TransitiveFilter &filter = nullptr) { if (!op) return; @@ -40,20 +41,41 @@ getForwardSliceImpl(Operation *op, SetVector *forwardSlice, for (Region ®ion : op->getRegions()) for (Block &block : region) for (Operation &blockOp : block) - if (forwardSlice->count(&blockOp) == 0) - getForwardSliceImpl(&blockOp, forwardSlice, filter); - for (Value result : op->getResults()) { - for (Operation *userOp : result.getUsers()) - if (forwardSlice->count(userOp) == 0) - getForwardSliceImpl(userOp, forwardSlice, filter); - } + if (forwardSlice->count(&blockOp) == 0) { + // We don't have to check if the 'blockOp' is already visited because + // there cannot be a traversal path from this nested op to the parent + // and thus a cycle cannot be closed here. We still have to mark it + // as visited to stop before visiting this operation again if it is + // part of a cycle. 
+ visited.insert(&blockOp); + getForwardSliceImpl(&blockOp, visited, forwardSlice, filter); + visited.erase(&blockOp); + } + + for (Value result : op->getResults()) + for (Operation *userOp : result.getUsers()) { + // A cycle can only occur within a basic block (not across regions or + // basic blocks) because the parent region must be a graph region, graph + // regions are restricted to always have 0 or 1 blocks, and there cannot + // be a def-use edge from a nested operation to an operation in an + // ancestor region. Therefore, we don't have to but may use the same + // 'visited' set across regions/blocks as long as we remove operations + // from the set again when the DFS traverses back from the leaf to the + // root. + if (forwardSlice->count(userOp) == 0 && visited.insert(userOp).second) + getForwardSliceImpl(userOp, visited, forwardSlice, filter); + + visited.erase(userOp); + } forwardSlice->insert(op); } void mlir::getForwardSlice(Operation *op, SetVector *forwardSlice, const ForwardSliceOptions &options) { - getForwardSliceImpl(op, forwardSlice, options.filter); + DenseSet visited; + visited.insert(op); + getForwardSliceImpl(op, visited, forwardSlice, options.filter); if (!options.inclusive) { // Don't insert the top level operation, we just queried on it and don't // want it in the results. @@ -69,8 +91,12 @@ void mlir::getForwardSlice(Operation *op, SetVector *forwardSlice, void mlir::getForwardSlice(Value root, SetVector *forwardSlice, const SliceOptions &options) { - for (Operation *user : root.getUsers()) - getForwardSliceImpl(user, forwardSlice, options.filter); + DenseSet visited; + for (Operation *user : root.getUsers()) { + visited.insert(user); + getForwardSliceImpl(user, visited, forwardSlice, options.filter); + visited.erase(user); + } // Reverse to get back the actual topological order. 
// std::reverse does not work out of the box on SetVector and I want an @@ -80,6 +106,7 @@ void mlir::getForwardSlice(Value root, SetVector *forwardSlice, } static LogicalResult getBackwardSliceImpl(Operation *op, + DenseSet &visited, SetVector *backwardSlice, const BackwardSliceOptions &options) { if (!op || op->hasTrait()) @@ -93,8 +120,12 @@ static LogicalResult getBackwardSliceImpl(Operation *op, auto processValue = [&](Value value) { if (auto *definingOp = value.getDefiningOp()) { - if (backwardSlice->count(definingOp) == 0) - return getBackwardSliceImpl(definingOp, backwardSlice, options); + if (backwardSlice->count(definingOp) == 0 && + visited.insert(definingOp).second) + return getBackwardSliceImpl(definingOp, visited, backwardSlice, + options); + + visited.erase(definingOp); } else if (auto blockArg = dyn_cast(value)) { if (options.omitBlockArguments) return success(); @@ -107,7 +138,8 @@ static LogicalResult getBackwardSliceImpl(Operation *op, if (parentOp && backwardSlice->count(parentOp) == 0) { if (parentOp->getNumRegions() == 1 && llvm::hasSingleElement(parentOp->getRegion(0).getBlocks())) { - return getBackwardSliceImpl(parentOp, backwardSlice, options); + return getBackwardSliceImpl(parentOp, visited, backwardSlice, + options); } } } else { @@ -145,7 +177,10 @@ static LogicalResult getBackwardSliceImpl(Operation *op, LogicalResult mlir::getBackwardSlice(Operation *op, SetVector *backwardSlice, const BackwardSliceOptions &options) { - LogicalResult result = getBackwardSliceImpl(op, backwardSlice, options); + DenseSet visited; + visited.insert(op); + LogicalResult result = + getBackwardSliceImpl(op, visited, backwardSlice, options); if (!options.inclusive) { // Don't insert the top level operation, we just queried on it and don't diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index d961482885300..7b790e90e0d87 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -97,6 +97,10 @@ static const char kOperationPrintDocstring[] = binary: Whether to write bytes (True) or str (False). Defaults to False. large_elements_limit: Whether to elide elements attributes above this number of elements. Defaults to None (no limit). + large_resource_limit: Whether to elide resource attributes above this + number of characters. Defaults to None (no limit). If large_elements_limit + is set and this is None, the behavior will be to use large_elements_limit + as large_resource_limit. enable_debug_info: Whether to print debug/location information. Defaults to False. 
pretty_debug_info: Whether to format debug information for easier reading @@ -1303,6 +1307,7 @@ void PyOperation::checkValid() const { } void PyOperationBase::print(std::optional largeElementsLimit, + std::optional largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, @@ -1314,10 +1319,10 @@ void PyOperationBase::print(std::optional largeElementsLimit, fileObject = nb::module_::import_("sys").attr("stdout"); MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate(); - if (largeElementsLimit) { + if (largeElementsLimit) mlirOpPrintingFlagsElideLargeElementsAttrs(flags, *largeElementsLimit); - mlirOpPrintingFlagsElideLargeResourceString(flags, *largeElementsLimit); - } + if (largeResourceLimit) + mlirOpPrintingFlagsElideLargeResourceString(flags, *largeResourceLimit); if (enableDebugInfo) mlirOpPrintingFlagsEnableDebugInfo(flags, /*enable=*/true, /*prettyForm=*/prettyDebugInfo); @@ -1405,6 +1410,7 @@ void PyOperationBase::walk( nb::object PyOperationBase::getAsm(bool binary, std::optional largeElementsLimit, + std::optional largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, @@ -1416,6 +1422,7 @@ nb::object PyOperationBase::getAsm(bool binary, fileObject = nb::module_::import_("io").attr("StringIO")(); } print(/*largeElementsLimit=*/largeElementsLimit, + /*largeResourceLimit=*/largeResourceLimit, /*enableDebugInfo=*/enableDebugInfo, /*prettyDebugInfo=*/prettyDebugInfo, /*printGenericOpForm=*/printGenericOpForm, @@ -3348,6 +3355,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { [](PyOperationBase &self) { return self.getAsm(/*binary=*/false, /*largeElementsLimit=*/std::nullopt, + /*largeResourceLimit=*/std::nullopt, /*enableDebugInfo=*/false, /*prettyDebugInfo=*/false, /*printGenericOpForm=*/false, @@ -3363,11 +3371,12 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("state"), nb::arg("file").none() = nb::none(), nb::arg("binary") = false, kOperationPrintStateDocstring) .def("print", - nb::overload_cast, bool, bool, bool, bool, - bool, bool, nb::object, bool, bool>( - &PyOperationBase::print), + nb::overload_cast, std::optional, + bool, bool, bool, bool, bool, bool, nb::object, + bool, bool>(&PyOperationBase::print), // Careful: Lots of arguments must match up with print method. nb::arg("large_elements_limit").none() = nb::none(), + nb::arg("large_resource_limit").none() = nb::none(), nb::arg("enable_debug_info") = false, nb::arg("pretty_debug_info") = false, nb::arg("print_generic_op_form") = false, @@ -3383,6 +3392,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { // Careful: Lots of arguments must match up with get_asm method. nb::arg("binary") = false, nb::arg("large_elements_limit").none() = nb::none(), + nb::arg("large_resource_limit").none() = nb::none(), nb::arg("enable_debug_info") = false, nb::arg("pretty_debug_info") = false, nb::arg("print_generic_op_form") = false, diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 9befcce725bb7..0fdd2d1a7eff6 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -599,18 +599,18 @@ class PyOperationBase { public: virtual ~PyOperationBase() = default; /// Implements the bound 'print' method and helps with others. 
- void print(std::optional largeElementsLimit, bool enableDebugInfo, + void print(std::optional largeElementsLimit, + std::optional largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, nanobind::object fileObject, bool binary, bool skipRegions); void print(PyAsmState &state, nanobind::object fileObject, bool binary); - nanobind::object getAsm(bool binary, - std::optional largeElementsLimit, - bool enableDebugInfo, bool prettyDebugInfo, - bool printGenericOpForm, bool useLocalScope, - bool useNameLocAsPrefix, bool assumeVerified, - bool skipRegions); + nanobind::object + getAsm(bool binary, std::optional largeElementsLimit, + std::optional largeResourceLimit, bool enableDebugInfo, + bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, + bool useNameLocAsPrefix, bool assumeVerified, bool skipRegions); // Implement the bound 'writeBytecode' method. void writeBytecode(const nanobind::object &fileObject, diff --git a/mlir/lib/Bindings/Python/Pass.cpp b/mlir/lib/Bindings/Python/Pass.cpp index 858c3bd5745fe..20017e25b69bb 100644 --- a/mlir/lib/Bindings/Python/Pass.cpp +++ b/mlir/lib/Bindings/Python/Pass.cpp @@ -78,12 +78,19 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) { [](PyPassManager &passManager, bool printBeforeAll, bool printAfterAll, bool printModuleScope, bool printAfterChange, bool printAfterFailure, std::optional largeElementsLimit, - bool enableDebugInfo, bool printGenericOpForm, + std::optional largeResourceLimit, bool enableDebugInfo, + bool printGenericOpForm, std::optional optionalTreePrintingPath) { MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate(); - if (largeElementsLimit) + if (largeElementsLimit) { mlirOpPrintingFlagsElideLargeElementsAttrs(flags, *largeElementsLimit); + mlirOpPrintingFlagsElideLargeResourceString(flags, + *largeElementsLimit); + } + if (largeResourceLimit) + mlirOpPrintingFlagsElideLargeResourceString(flags, + *largeResourceLimit); if (enableDebugInfo) mlirOpPrintingFlagsEnableDebugInfo(flags, /*enable=*/true, /*prettyForm=*/false); @@ -103,6 +110,7 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) { "print_module_scope"_a = false, "print_after_change"_a = false, "print_after_failure"_a = false, "large_elements_limit"_a.none() = nb::none(), + "large_resource_limit"_a.none() = nb::none(), "enable_debug_info"_a = false, "print_generic_op_form"_a = false, "tree_printing_dir_path"_a.none() = nb::none(), "Enable IR printing, default as mlir-print-ir-after-all.") @@ -112,6 +120,12 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) { mlirPassManagerEnableVerifier(passManager.get(), enable); }, "enable"_a, "Enable / disable verify-each.") + .def( + "enable_timing", + [](PyPassManager &passManager) { + mlirPassManagerEnableTiming(passManager.get()); + }, + "Enable pass timing.") .def_static( "parse", [](const std::string &pipeline, DefaultingPyMlirContext context) { diff --git a/mlir/lib/CAPI/IR/Pass.cpp b/mlir/lib/CAPI/IR/Pass.cpp index 883b7e8bb832d..3c499c3e4974d 100644 --- a/mlir/lib/CAPI/IR/Pass.cpp +++ b/mlir/lib/CAPI/IR/Pass.cpp @@ -75,6 +75,10 @@ void mlirPassManagerEnableVerifier(MlirPassManager passManager, bool enable) { unwrap(passManager)->enableVerifier(enable); } +void mlirPassManagerEnableTiming(MlirPassManager passManager) { + unwrap(passManager)->enableTiming(); +} + MlirOpPassManager mlirPassManagerGetNestedUnder(MlirPassManager passManager, MlirStringRef operationName) { 
return wrap(&unwrap(passManager)->nest(unwrap(operationName))); diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 24a48993ad80c..f84375b6b8d6a 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(AsyncToLLVM) add_subdirectory(BufferizationToMemRef) add_subdirectory(ComplexCommon) add_subdirectory(ComplexToLibm) +add_subdirectory(ComplexToROCDLLibraryCalls) add_subdirectory(ComplexToLLVM) add_subdirectory(ComplexToSPIRV) add_subdirectory(ComplexToStandard) diff --git a/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/CMakeLists.txt b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/CMakeLists.txt new file mode 100644 index 0000000000000..695bb2dd0a82c --- /dev/null +++ b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_conversion_library(MLIRComplexToROCDLLibraryCalls + ComplexToROCDLLibraryCalls.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ComplexToROCDLLibraryCalls + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRComplexDialect + MLIRFuncDialect + MLIRPass + MLIRTransformUtils + ) diff --git a/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp new file mode 100644 index 0000000000000..99d5424aef79a --- /dev/null +++ b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp @@ -0,0 +1,92 @@ +//=== ComplexToROCDLLibraryCalls.cpp - convert from Complex to ROCDL calls ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { +#define GEN_PASS_DEF_CONVERTCOMPLEXTOROCDLLIBRARYCALLS +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +using namespace mlir; + +namespace { + +template +// Pattern to convert Complex ops to ROCDL function calls. 
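// For instance, with the patterns registered by
// populateComplexToROCDLLibraryCallsConversionPatterns below, `complex.abs`
// of a `complex<f32>` operand becomes a call to the `__ocml_cabs_f32` device
// library function; a private `func.func` declaration is materialized on
// first use.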
+struct ComplexOpToROCDLLibraryCalls : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + ComplexOpToROCDLLibraryCalls(MLIRContext *context, StringRef funcName, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), funcName(funcName) {} + + LogicalResult matchAndRewrite(Op op, PatternRewriter &rewriter) const final { + Operation *symTable = SymbolTable::getNearestSymbolTable(op); + Type resType = op.getType(); + if (auto complexType = dyn_cast(resType)) + resType = complexType.getElementType(); + if (!isa(resType)) + return failure(); + + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(symTable, funcName)); + if (!opFunc) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&symTable->getRegion(0).front()); + auto funcTy = FunctionType::get( + rewriter.getContext(), op->getOperandTypes(), op->getResultTypes()); + opFunc = rewriter.create(rewriter.getUnknownLoc(), funcName, + funcTy); + opFunc.setPrivate(); + } + rewriter.replaceOpWithNewOp(op, funcName, op.getType(), + op->getOperands()); + return success(); + } + +private: + std::string funcName; +}; +} // namespace + +void mlir::populateComplexToROCDLLibraryCallsConversionPatterns( + RewritePatternSet &patterns) { + patterns.add>( + patterns.getContext(), "__ocml_cabs_f32"); + patterns.add>( + patterns.getContext(), "__ocml_cabs_f64"); + patterns.add>( + patterns.getContext(), "__ocml_cexp_f32"); + patterns.add>( + patterns.getContext(), "__ocml_cexp_f64"); +} + +namespace { +struct ConvertComplexToROCDLLibraryCallsPass + : public impl::ConvertComplexToROCDLLibraryCallsBase< + ConvertComplexToROCDLLibraryCallsPass> { + void runOnOperation() override; +}; +} // namespace + +void ConvertComplexToROCDLLibraryCallsPass::runOnOperation() { + Operation *op = getOperation(); + + RewritePatternSet patterns(&getContext()); + populateComplexToROCDLLibraryCallsConversionPatterns(patterns); + + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addIllegalOp(); + if (failed(applyPartialConversion(op, target, std::move(patterns)))) + signalPassFailure(); +} diff --git a/mlir/lib/Debug/BreakpointManagers/FileLineColLocBreakpointManager.cpp b/mlir/lib/Debug/BreakpointManagers/FileLineColLocBreakpointManager.cpp index 594aa11128611..4ad3e7f22a19a 100644 --- a/mlir/lib/Debug/BreakpointManagers/FileLineColLocBreakpointManager.cpp +++ b/mlir/lib/Debug/BreakpointManagers/FileLineColLocBreakpointManager.cpp @@ -7,8 +7,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Debug/BreakpointManagers/FileLineColLocBreakpointManager.h" -#include "mlir/IR/Diagnostics.h" -#include "llvm/Support/raw_ostream.h" using namespace mlir; using namespace mlir::tracing; diff --git a/mlir/lib/Debug/ExecutionContext.cpp b/mlir/lib/Debug/ExecutionContext.cpp index f7505b6608c81..f311d6d0f44f0 100644 --- a/mlir/lib/Debug/ExecutionContext.cpp +++ b/mlir/lib/Debug/ExecutionContext.cpp @@ -11,8 +11,6 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/Support/FormatVariadic.h" -#include - using namespace mlir; using namespace mlir::tracing; diff --git a/mlir/lib/Debug/Observers/ActionProfiler.cpp b/mlir/lib/Debug/Observers/ActionProfiler.cpp index a6b7d5e18aa07..88e6ae211a2d4 100644 --- a/mlir/lib/Debug/Observers/ActionProfiler.cpp +++ b/mlir/lib/Debug/Observers/ActionProfiler.cpp @@ -9,8 +9,6 @@ #include "mlir/Debug/Observers/ActionProfiler.h" #include "mlir/Debug/BreakpointManager.h" #include "mlir/IR/Action.h" -#include 
"mlir/Rewrite/PatternApplicator.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index acaf6a2f8792a..88c2eb3326d96 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -134,6 +134,8 @@ static bool hasGlobalMemorySpace(Attribute memorySpace) { } static bool hasWorkgroupMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; if (auto intMemorySpace = dyn_cast(memorySpace)) return intMemorySpace.getInt() == 3; if (auto gpuMemorySpace = dyn_cast(memorySpace)) @@ -142,6 +144,8 @@ static bool hasWorkgroupMemorySpace(Attribute memorySpace) { } static bool hasFatRawBufferMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; if (auto intMemorySpace = dyn_cast(memorySpace)) return intMemorySpace.getInt() == 7; if (auto gpuMemorySpace = dyn_cast(memorySpace)) diff --git a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp index 8bdb4c3593335..4739290bf6e4b 100644 --- a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp @@ -1550,15 +1550,17 @@ mlir::affine::computeSliceUnion(ArrayRef opsA, FlatAffineValueConstraints sliceUnionCst; assert(sliceUnionCst.getNumDimAndSymbolVars() == 0); std::vector> dependentOpPairs; - for (Operation *i : opsA) { - MemRefAccess srcAccess(i); - for (Operation *j : opsB) { - MemRefAccess dstAccess(j); + MemRefAccess srcAccess; + MemRefAccess dstAccess; + for (Operation *a : opsA) { + srcAccess = MemRefAccess(a); + for (Operation *b : opsB) { + dstAccess = MemRefAccess(b); if (srcAccess.memref != dstAccess.memref) continue; // Check if 'loopDepth' exceeds nesting depth of src/dst ops. - if ((!isBackwardSlice && loopDepth > getNestingDepth(i)) || - (isBackwardSlice && loopDepth > getNestingDepth(j))) { + if ((!isBackwardSlice && loopDepth > getNestingDepth(a)) || + (isBackwardSlice && loopDepth > getNestingDepth(b))) { LLVM_DEBUG(llvm::dbgs() << "Invalid loop depth\n"); return SliceComputationResult::GenericFailure; } @@ -1577,13 +1579,12 @@ mlir::affine::computeSliceUnion(ArrayRef opsA, } if (result.value == DependenceResult::NoDependence) continue; - dependentOpPairs.emplace_back(i, j); + dependentOpPairs.emplace_back(a, b); // Compute slice bounds for 'srcAccess' and 'dstAccess'. ComputationSliceState tmpSliceState; - mlir::affine::getComputationSliceState(i, j, dependenceConstraints, - loopDepth, isBackwardSlice, - &tmpSliceState); + getComputationSliceState(a, b, dependenceConstraints, loopDepth, + isBackwardSlice, &tmpSliceState); if (sliceUnionCst.getNumDimAndSymbolVars() == 0) { // Initialize 'sliceUnionCst' with the bounds computed in previous step. @@ -1948,16 +1949,16 @@ AffineForOp mlir::affine::insertBackwardComputationSlice( // Constructs MemRefAccess populating it with the memref, its indices and // opinst from 'loadOrStoreOpInst'. 
-MemRefAccess::MemRefAccess(Operation *loadOrStoreOpInst) { - if (auto loadOp = dyn_cast(loadOrStoreOpInst)) { +MemRefAccess::MemRefAccess(Operation *memOp) { + if (auto loadOp = dyn_cast(memOp)) { memref = loadOp.getMemRef(); - opInst = loadOrStoreOpInst; + opInst = memOp; llvm::append_range(indices, loadOp.getMapOperands()); } else { - assert(isa(loadOrStoreOpInst) && + assert(isa(memOp) && "Affine read/write op expected"); - auto storeOp = cast(loadOrStoreOpInst); - opInst = loadOrStoreOpInst; + auto storeOp = cast(memOp); + opInst = memOp; memref = storeOp.getMemRef(); llvm::append_range(indices, storeOp.getMapOperands()); } diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td index 64eccc76a6642..4558d827e8563 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td +++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td @@ -313,9 +313,9 @@ def IndexCastUIOfExtUI : // BitcastOp //===----------------------------------------------------------------------===// -// bitcast(bitcast(x)) -> x +// bitcast(type1, bitcast(type2, x)) -> bitcast(type1, x) def BitcastOfBitcast : - Pat<(Arith_BitcastOp (Arith_BitcastOp $x)), (replaceWithValue $x)>; + Pat<(Arith_BitcastOp (Arith_BitcastOp $x)), (Arith_BitcastOp $x)>; //===----------------------------------------------------------------------===// // ExtSIOp diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 62dce32bc4531..4a1527cd0369f 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -130,6 +130,17 @@ static RetTy parseOptionalLLVMKeyword(OpAsmParser &parser, return static_cast(index); } +static void printLLVMLinkage(OpAsmPrinter &p, Operation *, LinkageAttr val) { + p << stringifyLinkage(val.getLinkage()); +} + +static ParseResult parseLLVMLinkage(OpAsmParser &p, LinkageAttr &val) { + val = LinkageAttr::get( + p.getContext(), + parseOptionalLLVMKeyword(p, LLVM::Linkage::External)); + return success(); +} + //===----------------------------------------------------------------------===// // Operand bundle helpers. //===----------------------------------------------------------------------===// @@ -1166,14 +1177,17 @@ LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) { return emitOpError() << "'" << calleeName.getValue() << "' does not reference a symbol in the current scope"; - auto fn = dyn_cast(callee); - if (!fn) - return emitOpError() << "'" << calleeName.getValue() - << "' does not reference a valid LLVM function"; - - if (failed(verifyCallOpDebugInfo(*this, fn))) - return failure(); - fnType = fn.getFunctionType(); + if (auto fn = dyn_cast(callee)) { + if (failed(verifyCallOpDebugInfo(*this, fn))) + return failure(); + fnType = fn.getFunctionType(); + } else if (auto ifunc = dyn_cast(callee)) { + fnType = ifunc.getIFuncType(); + } else { + return emitOpError() + << "'" << calleeName.getValue() + << "' does not reference a valid LLVM function or IFunc"; + } } LLVMFunctionType funcType = llvm::dyn_cast(fnType); @@ -2029,14 +2043,6 @@ LogicalResult ReturnOp::verify() { // LLVM::AddressOfOp. 
//===----------------------------------------------------------------------===// -static Operation *parentLLVMModule(Operation *op) { - Operation *module = op->getParentOp(); - while (module && !satisfiesLLVMModule(module)) - module = module->getParentOp(); - assert(module && "unexpected operation outside of a module"); - return module; -} - GlobalOp AddressOfOp::getGlobal(SymbolTableCollection &symbolTable) { return dyn_cast_or_null( symbolTable.lookupSymbolIn(parentLLVMModule(*this), getGlobalNameAttr())); @@ -2052,6 +2058,11 @@ AliasOp AddressOfOp::getAlias(SymbolTableCollection &symbolTable) { symbolTable.lookupSymbolIn(parentLLVMModule(*this), getGlobalNameAttr())); } +IFuncOp AddressOfOp::getIFunc(SymbolTableCollection &symbolTable) { + return dyn_cast_or_null( + symbolTable.lookupSymbolIn(parentLLVMModule(*this), getGlobalNameAttr())); +} + LogicalResult AddressOfOp::verifySymbolUses(SymbolTableCollection &symbolTable) { Operation *symbol = @@ -2060,10 +2071,11 @@ AddressOfOp::verifySymbolUses(SymbolTableCollection &symbolTable) { auto global = dyn_cast_or_null(symbol); auto function = dyn_cast_or_null(symbol); auto alias = dyn_cast_or_null(symbol); + auto ifunc = dyn_cast_or_null(symbol); - if (!global && !function && !alias) + if (!global && !function && !alias && !ifunc) return emitOpError("must reference a global defined by 'llvm.mlir.global', " - "'llvm.mlir.alias' or 'llvm.func'"); + "'llvm.mlir.alias' or 'llvm.func' or 'llvm.mlir.ifunc'"); LLVMPointerType type = getType(); if ((global && global.getAddrSpace() != type.getAddressSpace()) || @@ -2673,6 +2685,69 @@ unsigned AliasOp::getAddrSpace() { return ptrTy.getAddressSpace(); } +//===----------------------------------------------------------------------===// +// IFuncOp +//===----------------------------------------------------------------------===// + +void IFuncOp::build(OpBuilder &builder, OperationState &result, StringRef name, + Type iFuncType, StringRef resolverName, Type resolverType, + Linkage linkage, LLVM::Visibility visibility) { + return build(builder, result, name, iFuncType, resolverName, resolverType, + linkage, /*dso_local=*/false, /*address_space=*/0, + UnnamedAddr::None, visibility); +} + +LogicalResult IFuncOp::verifySymbolUses(SymbolTableCollection &symbolTable) { + Operation *symbol = + symbolTable.lookupSymbolIn(parentLLVMModule(*this), getResolverAttr()); + // This matches LLVM IR verification logic, see llvm/lib/IR/Verifier.cpp + auto resolver = dyn_cast(symbol); + auto alias = dyn_cast(symbol); + while (alias) { + Block &initBlock = alias.getInitializerBlock(); + auto returnOp = cast(initBlock.getTerminator()); + auto addrOp = dyn_cast(returnOp.getArg().getDefiningOp()); + // FIXME: This is a best effort solution. The AliasOp body might be more + // complex and in that case we bail out with success. To completely match + // the LLVM IR logic it would be necessary to implement proper alias and + // cast stripping. 
+ if (!addrOp) + return success(); + resolver = addrOp.getFunction(symbolTable); + alias = addrOp.getAlias(symbolTable); + } + if (!resolver) + return emitOpError("must have a function resolver"); + Linkage linkage = resolver.getLinkage(); + if (resolver.isExternal() || linkage == Linkage::AvailableExternally) + return emitOpError("resolver must be a definition"); + if (!isa(resolver.getFunctionType().getReturnType())) + return emitOpError("resolver must return a pointer"); + auto resolverPtr = dyn_cast(getResolverType()); + if (!resolverPtr || resolverPtr.getAddressSpace() != getAddressSpace()) + return emitOpError("resolver has incorrect type"); + return success(); +} + +LogicalResult IFuncOp::verify() { + switch (getLinkage()) { + case Linkage::External: + case Linkage::Internal: + case Linkage::Private: + case Linkage::Weak: + case Linkage::WeakODR: + case Linkage::Linkonce: + case Linkage::LinkonceODR: + break; + default: + return emitOpError() << "'" << stringifyLinkage(getLinkage()) + << "' linkage not supported in ifuncs, available " + "options: private, internal, linkonce, weak, " + "linkonce_odr, weak_odr, or external linkage"; + } + return success(); +} + //===----------------------------------------------------------------------===// // ShuffleVectorOp //===----------------------------------------------------------------------===// @@ -4320,3 +4395,11 @@ bool mlir::LLVM::satisfiesLLVMModule(Operation *op) { return op->hasTrait() && op->hasTrait(); } + +Operation *mlir::LLVM::parentLLVMModule(Operation *op) { + Operation *module = op->getParentOp(); + while (module && !satisfiesLLVMModule(module)) + module = module->getParentOp(); + assert(module && "unexpected operation outside of a module"); + return module; +} diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 5d5f9de465561..c959310136319 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3920,7 +3920,8 @@ DiagnosedSilenceableFailure transform::VectorizeOp::apply( } FailureOr vectorResults = linalg::vectorize(rewriter, target, vectorSizes, getScalableSizes(), - getVectorizeNdExtract().value_or(false)); + getVectorizeNdExtract().value_or(false), false, + getAssumeDynamicDimsMatchVecSizes().value_or(false)); if (failed(vectorResults)) { return mlir::emitSilenceableFailure(target->getLoc()) << "Attempted to vectorize, but failed"; diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index 513cecef29b61..5a10883a6043c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -20,6 +20,7 @@ #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/Interfaces/TilingInterface.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "llvm/Support/Debug.h" @@ -887,26 +888,55 @@ struct PackOpTiling ArrayRef offsets(allOffsets[0]); ArrayRef sizes(allSizes[0]); - auto packOp = cast(op); - // It is not trivial to infer dest tile from source tile if `packOp` has - // padding semantic. 
- if (packOp.getPaddingValue()) - return failure(); - Location loc = packOp.getLoc(); - SmallVector outerDimOffsets, outerDimSizes; DenseMap dimAndTileMapping = packOp.getDimAndTileMapping(); for (auto dim : llvm::seq(packOp.getSourceRank())) { if (dimAndTileMapping.count(dim)) { - FailureOr cstSize = + FailureOr cstTileSize = ValueBoundsConstraintSet::computeConstantBound( presburger::BoundType::UB, sizes[dim], /*stopCondition=*/nullptr, /*closedUB=*/true); std::optional cstInnerSize = getConstantIntValue(dimAndTileMapping[dim]); + + // If a dimension is not tiled, it is always valid to fuse the pack op, + // even if the op has padding semantics, because it always generates a + // full slice along the dimension. + // TODO: It could be untiled if `srcDimSize` is dynamic, but it is then + // hard to determine whether a dimension is tiled or not. + int64_t srcDimSize = packOp.getSourceType().getDimSize(dim); + int64_t destDimSize = packOp.getDestType().getDimSize(dim); + bool isTiled = failed(cstTileSize) || + ShapedType::isDynamic(srcDimSize) || + cstTileSize.value() != srcDimSize; + if (!isTiled) { + outerDimOffsets.push_back(offsets[dim]); + if (ShapedType::isStatic(destDimSize)) { + outerDimSizes.push_back(b.getIndexAttr(destDimSize)); + } else { + outerDimSizes.push_back( + b.createOrFold(loc, packOp.getDest(), dim)); + } + continue; + } + + // If the dimension needs padding, it is not supported because there are + // iterations that only write padding values to the whole tile. The + // consumer fusion is driven by the source, so it is not possible to map + // an empty slice to the tile. + bool needExtraPadding = + ShapedType::isDynamic(destDimSize) || !cstInnerSize || + destDimSize * cstInnerSize.value() != srcDimSize; + // Prioritize the case where the op already states that it does not need + // padding. + if (!packOp.getPaddingValue()) + needExtraPadding = false; + if (needExtraPadding) + return failure(); + // Currently fusing `packOp` as consumer only expects perfect tiling // scenario because even if without padding semantic, the `packOp` may // also yield incomplete tiles. E.g. tensor<30xf32> -> tensor<5x6xf32>, @@ -921,9 +951,9 @@ struct PackOpTiling // another word, we can only support tiling with consumer if the tile // size for the producer is a multiple of the inner tile size for the // packed dimensions at this moment. - if (failed(cstSize) || !cstInnerSize || *cstSize % *cstInnerSize != 0) { + if ((failed(cstTileSize) || !cstInnerSize || + *cstTileSize % *cstInnerSize != 0)) return failure(); - } using AV = affine::AffineValueExpr; affine::AffineBuilder ab(b, loc); @@ -988,7 +1018,8 @@ struct PackOpTiling loc, packOp.getDest(), outputOffsets, outputSizes, strides); tiledOperands.push_back(outSlice); - assert(!packOp.getPaddingValue() && "Expect no padding semantic"); + if (auto val = packOp.getPaddingValue()) + tiledOperands.push_back(val); for (auto tile : packOp.getInnerTiles()) tiledOperands.push_back(tile); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 5a8c5eab3f444..4add50f4b36e5 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -219,7 +219,8 @@ struct VectorizationState { /// canonical vector shape for vectorization. 
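/// If `assumeDynamicDimsMatchVecSizes` is set, dynamic dimensions are assumed
/// to match the corresponding vector sizes, and mask generation may be skipped
/// for them (see the documentation of the flag further down in this struct).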
LogicalResult initState(RewriterBase &rewriter, LinalgOp linalgOp, ArrayRef inputVectorSizes, - ArrayRef inputScalableVecDims); + ArrayRef inputScalableVecDims, + bool assumeDynamicDimsMatchVecSizes = false); /// Returns the canonical vector shape used to vectorize the iteration space. ArrayRef getCanonicalVecShape() const { return canonicalVecShape; } @@ -328,6 +329,14 @@ struct VectorizationState { /// Global vectorization guard for the incoming rewriter. It's initialized /// when the vectorization state is initialized. OpBuilder::InsertionGuard rewriterGuard; + + /// Do all dynamic dims match the corresponding vector sizes? + /// + /// When a dynamic tensor/memref dimension matches the corresponding vector + /// dimension, masking can be safely skipped, despite the presence of dynamic + /// shapes. Use this flag with care and only for cases where you are + /// confident the assumption holds. + bool assumeDynamicDimsMatchVecSizes = false; }; LogicalResult @@ -364,10 +373,12 @@ VectorizationState::precomputeIterSpaceValueSizes(RewriterBase &rewriter, /// Initializes the vectorization state, including the computation of the /// canonical vector shape for vectorization. // TODO: Move this to the constructor when we can remove the failure cases. -LogicalResult -VectorizationState::initState(RewriterBase &rewriter, LinalgOp linalgOp, - ArrayRef inputVectorSizes, - ArrayRef inputScalableVecDims) { +LogicalResult VectorizationState::initState(RewriterBase &rewriter, + LinalgOp linalgOp, + ArrayRef inputVectorSizes, + ArrayRef inputScalableVecDims, + bool assumeDimsMatchVec) { + assumeDynamicDimsMatchVecSizes = assumeDimsMatchVec; // Initialize the insertion point. rewriter.setInsertionPoint(linalgOp); @@ -467,6 +478,23 @@ Value VectorizationState::getOrCreateMaskFor( return Value(); } + if (assumeDynamicDimsMatchVecSizes) { + // While for _dynamic_ dim sizes we can _assume_ that the corresponding + // vector sizes match, we still need to check the _static_ dim sizes. Only + // then we can be 100% sure that masking is not required. + if (llvm::all_of(llvm::zip(permutedStaticSizes, maskType.getShape()), + [](auto it) { + return std::get<0>(it) == ShapedType::kDynamic + ? true + : std::get<0>(it) == std::get<1>(it); + })) { + LDBG("Dynamic + static dimensions match vector sizes, masking is not " + "required.\n"); + activeMaskCache[maskingMap] = Value(); + return Value(); + } + } + // Permute the iteration space value sizes to compute the mask upper bounds. SmallVector upperBounds = applyPermutationMap(maskingMap, ArrayRef(iterSpaceValueSizes)); @@ -1928,11 +1956,8 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, unpackOp.getDestType().hasStaticShape() ? 
vectorSizes : shapeCastOp.getResultVectorType().getShape()); - Value dest = rewriter.create<tensor::EmptyOp>( - loc, reifiedRetShapes[0], - shapeCastOp.getResult().getType().getElementType()); Operation *write = createWriteOrMaskedWrite( - rewriter, loc, shapeCastOp.getResult(), dest, + rewriter, loc, shapeCastOp.getResult(), unpackOp.getDest(), /*writeIndices=*/{}, useInBoundsInsteadOfMasking); newResults.push_back(write->getResult(0)); return success(); @@ -2472,7 +2497,8 @@ vectorizeScalableVectorPrecondition(Operation *op, return success(isElementwise(linalgOp) || isa(op) || isa(op) || isa(op) || - isa(op) || hasReductionIterator(linalgOp)); + isa(op) || isa(op) || + hasReductionIterator(linalgOp)); } LogicalResult mlir::linalg::vectorizeOpPrecondition( @@ -2528,11 +2554,10 @@ bool mlir::linalg::hasVectorizationImpl(Operation *op) { tensor::InsertSliceOp>(op); } -FailureOr -mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, - ArrayRef<int64_t> inputVectorSizes, - ArrayRef<bool> inputScalableVecDims, - bool vectorizeNDExtract, bool flatten1DDepthwiseConv) { +FailureOr mlir::linalg::vectorize( + RewriterBase &rewriter, Operation *op, ArrayRef<int64_t> inputVectorSizes, + ArrayRef<bool> inputScalableVecDims, bool vectorizeNDExtract, + bool flatten1DDepthwiseConv, bool assumeDynamicDimsMatchVecSizes) { LDBG("Attempting to vectorize:\n" << *op << "\n"); LDBG("Input vector sizes: "); LLVM_DEBUG(llvm::interleaveComma(inputVectorSizes, llvm::dbgs())); @@ -2552,7 +2577,8 @@ mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, VectorizationState state(rewriter); if (auto linalgOp = dyn_cast<LinalgOp>(op)) { if (failed(state.initState(rewriter, linalgOp, inputVectorSizes, - inputScalableVecDims))) { + inputScalableVecDims, + assumeDynamicDimsMatchVecSizes))) { LDBG("Vectorization state couldn't be initialized\n"); return failure(); } diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index f2eab62b286af..fbc1f003ab648 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/LogicalResult.h" +#include <variant> using namespace mlir; using namespace acc; @@ -3461,40 +3462,88 @@ LogicalResult acc::RoutineOp::verify() { return success(); } -static ParseResult parseBindName(OpAsmParser &parser, mlir::ArrayAttr &bindName, - mlir::ArrayAttr &deviceTypes) { - llvm::SmallVector<mlir::Attribute> bindNameAttrs; - llvm::SmallVector<mlir::Attribute> deviceTypeAttrs; +static ParseResult parseBindName(OpAsmParser &parser, + mlir::ArrayAttr &bindIdName, + mlir::ArrayAttr &bindStrName, + mlir::ArrayAttr &deviceIdTypes, + mlir::ArrayAttr &deviceStrTypes) { + llvm::SmallVector<mlir::Attribute> bindIdNameAttrs; + llvm::SmallVector<mlir::Attribute> bindStrNameAttrs; + llvm::SmallVector<mlir::Attribute> deviceIdTypeAttrs; + llvm::SmallVector<mlir::Attribute> deviceStrTypeAttrs; if (failed(parser.parseCommaSeparatedList([&]() { - if (parser.parseAttribute(bindNameAttrs.emplace_back())) + mlir::Attribute newAttr; + if (parser.parseAttribute(newAttr)) return failure(); + bool isSymbolRefAttr = false; + if (auto symbolRefAttr = dyn_cast<mlir::SymbolRefAttr>(newAttr)) { + bindIdNameAttrs.push_back(symbolRefAttr); + isSymbolRefAttr = true; + } else if (auto stringAttr = dyn_cast<mlir::StringAttr>(newAttr)) { + bindStrNameAttrs.push_back(stringAttr); + } else { + return failure(); + } if (failed(parser.parseOptionalLSquare())) { - deviceTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( - parser.getContext(), mlir::acc::DeviceType::None)); + if (isSymbolRefAttr) { +
deviceIdTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } else { + deviceStrTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } } else { - if (parser.parseAttribute(deviceTypeAttrs.emplace_back()) || - parser.parseRSquare()) - return failure(); + if (isSymbolRefAttr) { + if (parser.parseAttribute(deviceIdTypeAttrs.emplace_back()) || + parser.parseRSquare()) + return failure(); + } else { + if (parser.parseAttribute(deviceStrTypeAttrs.emplace_back()) || + parser.parseRSquare()) + return failure(); + } } return success(); }))) return failure(); - bindName = ArrayAttr::get(parser.getContext(), bindNameAttrs); - deviceTypes = ArrayAttr::get(parser.getContext(), deviceTypeAttrs); + bindIdName = ArrayAttr::get(parser.getContext(), bindIdNameAttrs); + bindStrName = ArrayAttr::get(parser.getContext(), bindStrNameAttrs); + deviceIdTypes = ArrayAttr::get(parser.getContext(), deviceIdTypeAttrs); + deviceStrTypes = ArrayAttr::get(parser.getContext(), deviceStrTypeAttrs); return success(); } static void printBindName(mlir::OpAsmPrinter &p, mlir::Operation *op, - std::optional bindName, - std::optional deviceTypes) { - llvm::interleaveComma(llvm::zip(*bindName, *deviceTypes), p, - [&](const auto &pair) { - p << std::get<0>(pair); - printSingleDeviceType(p, std::get<1>(pair)); - }); + std::optional bindIdName, + std::optional bindStrName, + std::optional deviceIdTypes, + std::optional deviceStrTypes) { + // Create combined vectors for all bind names and device types + llvm::SmallVector allBindNames; + llvm::SmallVector allDeviceTypes; + + // Append bindIdName and deviceIdTypes + if (hasDeviceTypeValues(deviceIdTypes)) { + allBindNames.append(bindIdName->begin(), bindIdName->end()); + allDeviceTypes.append(deviceIdTypes->begin(), deviceIdTypes->end()); + } + + // Append bindStrName and deviceStrTypes + if (hasDeviceTypeValues(deviceStrTypes)) { + allBindNames.append(bindStrName->begin(), bindStrName->end()); + allDeviceTypes.append(deviceStrTypes->begin(), deviceStrTypes->end()); + } + + // Print the combined sequence + if (!allBindNames.empty()) + llvm::interleaveComma(llvm::zip(allBindNames, allDeviceTypes), p, + [&](const auto &pair) { + p << std::get<0>(pair); + printSingleDeviceType(p, std::get<1>(pair)); + }); } static ParseResult parseRoutineGangClause(OpAsmParser &parser, @@ -3654,19 +3703,32 @@ bool RoutineOp::hasSeq(mlir::acc::DeviceType deviceType) { return hasDeviceType(getSeq(), deviceType); } -std::optional RoutineOp::getBindNameValue() { +std::optional> +RoutineOp::getBindNameValue() { return getBindNameValue(mlir::acc::DeviceType::None); } -std::optional +std::optional> RoutineOp::getBindNameValue(mlir::acc::DeviceType deviceType) { - if (!hasDeviceTypeValues(getBindNameDeviceType())) + if (!hasDeviceTypeValues(getBindIdNameDeviceType()) && + !hasDeviceTypeValues(getBindStrNameDeviceType())) { return std::nullopt; - if (auto pos = findSegment(*getBindNameDeviceType(), deviceType)) { - auto attr = (*getBindName())[*pos]; + } + + if (auto pos = findSegment(*getBindIdNameDeviceType(), deviceType)) { + auto attr = (*getBindIdName())[*pos]; + auto symbolRefAttr = dyn_cast(attr); + assert(symbolRefAttr && "expected SymbolRef"); + return symbolRefAttr; + } + + if (auto pos = findSegment(*getBindStrNameDeviceType(), deviceType)) { + auto attr = (*getBindStrName())[*pos]; auto stringAttr = dyn_cast(attr); - return stringAttr.getValue(); + assert(stringAttr && "expected String"); + 
return stringAttr; } + + return std::nullopt; } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 9bea8b7a732a8..769aee64e1695 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/ADT/bit.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" #include @@ -3863,6 +3864,20 @@ LogicalResult ScanOp::verify() { "reduction modifier"); } +/// Verifies the align clause of the allocate directive. +LogicalResult AllocateDirOp::verify() { + std::optional<uint64_t> align = this->getAlign(); + + if (align.has_value()) { + if ((align.value() > 0) && !llvm::has_single_bit(align.value())) + return emitError() << "ALIGN value : " << align.value() + << " must be a power of 2"; + } + + return success(); +} + #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc" diff --git a/mlir/lib/Dialect/PDL/IR/PDL.cpp b/mlir/lib/Dialect/PDL/IR/PDL.cpp index d3f7c9798b9b8..8af93335ca96c 100644 --- a/mlir/lib/Dialect/PDL/IR/PDL.cpp +++ b/mlir/lib/Dialect/PDL/IR/PDL.cpp @@ -11,7 +11,6 @@ #include "mlir/Dialect/PDL/IR/PDLTypes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/Interfaces/InferTypeOpInterface.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/TypeSwitch.h" #include diff --git a/mlir/lib/Dialect/PDLInterp/IR/PDLInterp.cpp b/mlir/lib/Dialect/PDLInterp/IR/PDLInterp.cpp index 9b1f11d835282..1c713eb6f467c 100644 --- a/mlir/lib/Dialect/PDLInterp/IR/PDLInterp.cpp +++ b/mlir/lib/Dialect/PDLInterp/IR/PDLInterp.cpp @@ -9,7 +9,6 @@ #include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" #include "mlir/Dialect/PDL/IR/PDLTypes.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/DialectImplementation.h" #include "mlir/Interfaces/FunctionImplementation.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp b/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp index b819f56841852..772d25d8a31ee 100644 --- a/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp +++ b/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp @@ -11,8 +11,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Ptr/IR/PtrAttrs.h" -#include "mlir/IR/BuiltinTypes.h" -#include "llvm/ADT/TypeSwitch.h" using namespace mlir; using namespace mlir::ptr; diff --git a/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp b/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp index c488144508128..b44dbfd98a15e 100644 --- a/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp +++ b/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp @@ -13,10 +13,8 @@ #include "mlir/Dialect/Ptr/IR/PtrOps.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" #include "mlir/Transforms/InliningUtils.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/TypeSwitch.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp b/mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp index 7ad2a6bc4c80b..689de15a0415e 100644 --- a/mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp +++ b/mlir/lib/Dialect/Ptr/IR/PtrTypes.cpp @@ -12,7 +12,6 @@ #include "mlir/Dialect/Ptr/IR/PtrTypes.h" #include "mlir/Dialect/Ptr/IR/PtrAttrs.h" -#include "llvm/ADT/TypeSwitch.h" using namespace mlir; using namespace mlir::ptr; diff --git a/mlir/lib/Dialect/Quant/IR/QuantDialectBytecode.cpp b/mlir/lib/Dialect/Quant/IR/QuantDialectBytecode.cpp
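Returning to the OpenACC routine change above: bind names may now be either symbol references (resolved functions) or plain strings, each optionally qualified by a device type, and the two kinds are kept in separate attribute arrays. A sketch of the expected form (op syntax assumed, not quoted from this patch):

    // Hypothetical: @foo_gpu lands in bindIdName/deviceIdTypes,
    // "foo_host" in bindStrName/deviceStrTypes.
    acc.routine @routine_0 func(@foo)
        bind(@foo_gpu [#acc.device_type<nvidia>], "foo_host")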
index 44ec0c517d561..1b3cc5a43f460 100644 --- a/mlir/lib/Dialect/Quant/IR/QuantDialectBytecode.cpp +++ b/mlir/lib/Dialect/Quant/IR/QuantDialectBytecode.cpp @@ -13,8 +13,6 @@ #include "mlir/Dialect/Quant/IR/QuantTypes.h" #include "mlir/IR/Diagnostics.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TypeSwitch.h" using namespace mlir; diff --git a/mlir/lib/Dialect/Quant/IR/QuantTypes.cpp b/mlir/lib/Dialect/Quant/IR/QuantTypes.cpp index 9b8eec609b039..8122c4db684e5 100644 --- a/mlir/lib/Dialect/Quant/IR/QuantTypes.cpp +++ b/mlir/lib/Dialect/Quant/IR/QuantTypes.cpp @@ -12,9 +12,6 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/MLIRContext.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/MathExtras.h" using namespace mlir; using namespace mlir::quant; diff --git a/mlir/lib/Dialect/Quant/Transforms/NormalizeQuantTypes.cpp b/mlir/lib/Dialect/Quant/Transforms/NormalizeQuantTypes.cpp index 030cf07794377..1c3971d262042 100644 --- a/mlir/lib/Dialect/Quant/Transforms/NormalizeQuantTypes.cpp +++ b/mlir/lib/Dialect/Quant/Transforms/NormalizeQuantTypes.cpp @@ -16,7 +16,6 @@ #include "mlir/Dialect/Quant/IR/Quant.h" #include "mlir/Dialect/Quant/IR/QuantTypes.h" #include "mlir/Dialect/Quant/Transforms/Passes.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { diff --git a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp index 4009faa21576d..920b6ecb01d47 100644 --- a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp +++ b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp @@ -10,16 +10,11 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Quant/IR/Quant.h" #include "mlir/Dialect/Quant/IR/QuantTypes.h" #include "mlir/Dialect/Quant/Transforms/Passes.h" -#include "mlir/Dialect/Shape/IR/Shape.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/DialectConversion.h" diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 5a3bd984530db..00c31a1500e17 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -25,7 +25,6 @@ #include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/TypeSwitch.h" using namespace mlir; using namespace mlir::scf; diff --git a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp index 57c27231f2144..181a29ef70e82 100644 --- a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp +++ b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp @@ -11,15 +11,12 @@ #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/LoopUtils.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/Dialect/SCF/Utils/Utils.h" #include 
"mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp index 24fbc1dca8361..a44612410bdee 100644 --- a/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp @@ -8,7 +8,6 @@ #include "mlir/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.h" #include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/SCF/IR/SCF.h" using namespace mlir; diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp index d36d91249ed36..99d021081f866 100644 --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -15,7 +15,6 @@ #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/Operation.h" diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp index b71ec985fa6a1..d17cd47b9043a 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopSpecialization.cpp @@ -24,7 +24,6 @@ #include "mlir/IR/IRMapping.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "llvm/ADT/DenseMap.h" namespace mlir { #define GEN_PASS_DEF_SCFFORLOOPPEELING diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp index 358a3b38a4cd3..0f00806870d8a 100644 --- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp @@ -11,9 +11,7 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Utils/Utils.h" -#include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" namespace mlir { diff --git a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp index 29d6d2574a2be..7e9a4d71251b1 100644 --- a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp @@ -10,8 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/SCF/Transforms/Passes.h" - #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" diff --git a/mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp b/mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp index 89dfb61d48af9..b73554cde74c3 100644 --- a/mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/AffineCanonicalizationUtils.cpp @@ -20,9 +20,7 @@ #include 
"mlir/Dialect/SCF/Utils/AffineCanonicalizationUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" -#include "llvm/Support/Debug.h" #define DEBUG_TYPE "mlir-scf-affine-utils" diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index f4047be68ccf2..2d2cc9f074d06 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -17,15 +17,12 @@ #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" diff --git a/mlir/lib/Dialect/SMT/IR/SMTAttributes.cpp b/mlir/lib/Dialect/SMT/IR/SMTAttributes.cpp index 3f40d6a42eafd..20d88a9d6872e 100644 --- a/mlir/lib/Dialect/SMT/IR/SMTAttributes.cpp +++ b/mlir/lib/Dialect/SMT/IR/SMTAttributes.cpp @@ -12,7 +12,6 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" #include "llvm/ADT/TypeSwitch.h" -#include "llvm/Support/Format.h" using namespace mlir; using namespace mlir::smt; diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp index 7042f12ba00f8..656236246b1ad 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp @@ -763,6 +763,44 @@ void mlir::spirv::AddressOfOp::getAsmResultNames( setNameFn(getResult(), specialName.str()); } +//===----------------------------------------------------------------------===// +// spirv.EXTConstantCompositeReplicate +//===----------------------------------------------------------------------===// + +LogicalResult spirv::EXTConstantCompositeReplicateOp::verify() { + Type valueType; + if (auto typedAttr = dyn_cast(getValue())) { + valueType = typedAttr.getType(); + } else if (auto arrayAttr = dyn_cast(getValue())) { + auto typedElemAttr = dyn_cast(arrayAttr[0]); + if (!typedElemAttr) + return emitError("value attribute is not typed"); + valueType = + spirv::ArrayType::get(typedElemAttr.getType(), arrayAttr.size()); + } else { + return emitError("unknown value attribute type"); + } + + auto compositeType = dyn_cast(getType()); + if (!compositeType) + return emitError("result type is not a composite type"); + + Type compositeElementType = compositeType.getElementType(0); + + SmallVector possibleTypes = {compositeElementType}; + while (auto type = dyn_cast(compositeElementType)) { + compositeElementType = type.getElementType(0); + possibleTypes.push_back(compositeElementType); + } + + if (!is_contained(possibleTypes, valueType)) { + return emitError("expected value attribute type ") + << interleaved(possibleTypes, " or ") << ", but got: " << valueType; + } + + return success(); +} + //===----------------------------------------------------------------------===// // spirv.ControlBarrierOp //===----------------------------------------------------------------------===// @@ -1864,6 +1902,69 @@ LogicalResult spirv::SpecConstantCompositeOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// 
spirv.EXTSpecConstantCompositeReplicateOp +//===----------------------------------------------------------------------===// + +ParseResult +spirv::EXTSpecConstantCompositeReplicateOp::parse(OpAsmParser &parser, + OperationState &result) { + StringAttr compositeName; + FlatSymbolRefAttr specConstRef; + const char *attrName = "spec_const"; + NamedAttrList attrs; + Type type; + + if (parser.parseSymbolName(compositeName, SymbolTable::getSymbolAttrName(), + result.attributes) || + parser.parseLParen() || + parser.parseAttribute(specConstRef, Type(), attrName, attrs) || + parser.parseRParen() || parser.parseColonType(type)) + return failure(); + + StringAttr compositeSpecConstituentName = + spirv::EXTSpecConstantCompositeReplicateOp::getConstituentAttrName( + result.name); + result.addAttribute(compositeSpecConstituentName, specConstRef); + + StringAttr typeAttrName = + spirv::EXTSpecConstantCompositeReplicateOp::getTypeAttrName(result.name); + result.addAttribute(typeAttrName, TypeAttr::get(type)); + + return success(); +} + +void spirv::EXTSpecConstantCompositeReplicateOp::print(OpAsmPrinter &printer) { + printer << " "; + printer.printSymbolName(getSymName()); + printer << " (" << this->getConstituent() << ") : " << getType(); +} + +LogicalResult spirv::EXTSpecConstantCompositeReplicateOp::verify() { + auto compositeType = dyn_cast(getType()); + if (!compositeType) + return emitError("result type must be a composite type, but provided ") + << getType(); + + Operation *constituentOp = SymbolTable::lookupNearestSymbolFrom( + (*this)->getParentOp(), this->getConstituent()); + if (!constituentOp) + return emitError( + "splat spec constant reference defining constituent not found"); + + auto constituentSpecConstOp = dyn_cast(constituentOp); + if (!constituentSpecConstOp) + return emitError("constituent is not a spec constant"); + + Type constituentType = constituentSpecConstOp.getDefaultValue().getType(); + Type compositeElementType = compositeType.getElementType(0); + if (constituentType != compositeElementType) + return emitError("constituent has incorrect type: expected ") + << compositeElementType << ", but provided " << constituentType; + + return success(); +} + //===----------------------------------------------------------------------===// // spirv.SpecConstantOperation //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index 5f395eef9d601..7805599738749 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -25,7 +25,6 @@ #include "mlir/Interfaces/FunctionImplementation.h" #include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/SetOperations.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/raw_ostream.h" diff --git a/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp index 8a471c12d21e4..23a4266d6b780 100644 --- a/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.cpp @@ -11,7 +11,6 @@ #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Shape/IR/Shape.h" -#include "mlir/IR/Dialect.h" #include "mlir/IR/Operation.h" #include "mlir/IR/PatternMatch.h" diff --git a/mlir/lib/Dialect/Shape/Transforms/OutlineShapeComputation.cpp 
b/mlir/lib/Dialect/Shape/Transforms/OutlineShapeComputation.cpp index ae06a34b65709..0fe10728b3045 100644 --- a/mlir/lib/Dialect/Shape/Transforms/OutlineShapeComputation.cpp +++ b/mlir/lib/Dialect/Shape/Transforms/OutlineShapeComputation.cpp @@ -12,14 +12,11 @@ #include "mlir/Dialect/Shape/Transforms/Passes.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/IRMapping.h" -#include "mlir/IR/Matchers.h" -#include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/Debug.h" #include -#include #include namespace mlir { diff --git a/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp b/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp index 121e0cc133e19..d83ceab72bba9 100644 --- a/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp +++ b/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp @@ -13,7 +13,6 @@ #include "mlir/Dialect/Shape/IR/Shape.h" #include "mlir/IR/Builders.h" #include "mlir/IR/PatternMatch.h" -#include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index 2dd45d27157cb..5758d8d5ef506 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -1295,7 +1295,8 @@ OpFoldResult CastOp::fold(FoldAdaptor adaptor) { } if (llvm::isa(inETy) && llvm::isa(outETy)) { - auto unsignIn = llvm::cast(inETy).isUnsignedInteger(); + const auto inIntType = llvm::cast(inETy); + auto unsignIn = inIntType.isUnsignedInteger(); bool trunc = inETy.getIntOrFloatBitWidth() > outETy.getIntOrFloatBitWidth(); auto intVal = operand.getSplatValue(); @@ -1303,7 +1304,8 @@ OpFoldResult CastOp::fold(FoldAdaptor adaptor) { if (trunc) { intVal = intVal.trunc(bitwidth); - } else if (unsignIn) { + // i1 types are boolean in TOSA + } else if (unsignIn || inIntType.isInteger(1)) { intVal = intVal.zext(bitwidth); } else { intVal = intVal.sext(bitwidth); diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 4a952ac062cad..f0ff430bae882 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -3417,7 +3417,8 @@ LogicalResult TransposeConv2DOp::verify() { return success(); const int64_t outputChannels = outputType.getDimSize(3); - if (biasChannels != outputChannels && biasChannels != 1) + if (!ShapedType::isDynamic(outputChannels) && + biasChannels != outputChannels && biasChannels != 1) return emitOpError( "bias channels expected to be equal to output channels (") << outputChannels << ") or 1, got " << biasChannels; diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp index a4edccfd4c9c7..88b0f3650ca01 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp @@ -225,20 +225,6 @@ LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::VariableWriteOp op) { return success(); } -template <> -LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::IfOp op) { - addValue(op.getCondition()); - return success(); -} - -template <> -LogicalResult ProfileInfoDepot::populateProfileInfo(tosa::WhileOp op) { - Block *block = &op.getCondGraph().front(); - Operation *terminator = block->getTerminator(); - 
addValue(terminator->getOperands().front()); - return success(); -} - LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) { // This helper function only populates the info for the customised operands. #define POPULATE_PROFILE_INFO_CUSTOM(tosaOp) \ @@ -280,8 +266,6 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) { POPULATE_PROFILE_INFO_CUSTOM(MatMul) POPULATE_PROFILE_INFO_CUSTOM(Variable) POPULATE_PROFILE_INFO_CUSTOM(VariableWrite) - POPULATE_PROFILE_INFO_CUSTOM(If) - POPULATE_PROFILE_INFO_CUSTOM(While) // For the most of tosa operators, all operands are profile/extension related // and hence are all considered in this profile-based compilance check. @@ -340,6 +324,8 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) { // constraint for those operations. POPULATE_PROFILE_INFO_SKIP(ConstShape) POPULATE_PROFILE_INFO_SKIP(Yield) + POPULATE_PROFILE_INFO_SKIP(If) + POPULATE_PROFILE_INFO_SKIP(While) return failure(); } diff --git a/mlir/lib/Dialect/Traits.cpp b/mlir/lib/Dialect/Traits.cpp index 2d43a1e52a0b1..6e0a4eccb007c 100644 --- a/mlir/lib/Dialect/Traits.cpp +++ b/mlir/lib/Dialect/Traits.cpp @@ -9,7 +9,6 @@ #include "mlir/Dialect/Traits.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/TypeUtilities.h" -#include "llvm/Support/FormatVariadic.h" #include using namespace mlir; diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp index a32732e8b7007..bc85cf4231ba5 100644 --- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp @@ -13,7 +13,6 @@ #include "mlir/IR/BuiltinTypeInterfaces.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/LogicalResult.h" #include #include diff --git a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp index 88f02369cb7ab..7b9e51a01a946 100644 --- a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp +++ b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" namespace mlir::vector { diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index b58ed6b5d1144..7d615bfc12984 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -47,7 +47,6 @@ #include #include -#include #include "mlir/Dialect/Vector/IR/VectorDialect.cpp.inc" // Pull in all enum type and utility function definitions. @@ -4238,28 +4237,35 @@ class StridedSliceBroadcast final auto dstVecType = llvm::cast(op.getType()); unsigned dstRank = dstVecType.getRank(); unsigned rankDiff = dstRank - srcRank; - // Check if the most inner dimensions of the source of the broadcast are the - // same as the destination of the extract. If this is the case we can just - // use a broadcast as the original dimensions are untouched. - bool lowerDimMatch = true; + // Source dimensions can be broadcasted (1 -> n with n > 1) or sliced + // (n -> m with n > m). If they are originally both broadcasted *and* + // sliced, this can be simplified to just broadcasting. 
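A hand-worked instance of the simplification described in the comment above (illustrative IR, not from this patch's tests): dim 0 is both broadcast (1 -> 8) and sliced (8 -> 2), so the slice is moved before the broadcast with its offset/size on that dim reset to 0/1.

    //   %b = vector.broadcast %src : vector<1x4xf32> to vector<8x4xf32>
    //   %s = vector.extract_strided_slice %b
    //       {offsets = [2, 1], sizes = [2, 2], strides = [1, 1]}
    //       : vector<8x4xf32> to vector<2x2xf32>
    // becomes:
    //   %s2 = vector.extract_strided_slice %src
    //       {offsets = [0, 1], sizes = [1, 2], strides = [1, 1]}
    //       : vector<1x4xf32> to vector<1x2xf32>
    //   %r = vector.broadcast %s2 : vector<1x2xf32> to vector<2x2xf32>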
+ bool needsSlice = false; for (unsigned i = 0; i < srcRank; i++) { - if (srcVecType.getDimSize(i) != dstVecType.getDimSize(i + rankDiff)) { - lowerDimMatch = false; + if (srcVecType.getDimSize(i) != 1 && + srcVecType.getDimSize(i) != dstVecType.getDimSize(i + rankDiff)) { + needsSlice = true; break; } } Value source = broadcast.getSource(); - // If the inner dimensions don't match, it means we need to extract from the - // source of the orignal broadcast and then broadcast the extracted value. - // We also need to handle degenerated cases where the source is effectively - // just a single scalar. - bool isScalarSrc = (srcRank == 0 || srcVecType.getNumElements() == 1); - if (!lowerDimMatch && !isScalarSrc) { + if (needsSlice) { + SmallVector offsets = + getI64SubArray(op.getOffsets(), /*dropFront=*/rankDiff); + SmallVector sizes = + getI64SubArray(op.getSizes(), /*dropFront=*/rankDiff); + for (unsigned i = 0; i < srcRank; i++) { + if (srcVecType.getDimSize(i) == 1) { + // In case this dimension was broadcasted *and* sliced, the offset + // and size need to be updated now that there is no broadcast before + // the slice. + offsets[i] = 0; + sizes[i] = 1; + } + } source = rewriter.create( - op->getLoc(), source, - getI64SubArray(op.getOffsets(), /* dropFront=*/rankDiff), - getI64SubArray(op.getSizes(), /* dropFront=*/rankDiff), - getI64SubArray(op.getStrides(), /* dropFront=*/rankDiff)); + op->getLoc(), source, offsets, sizes, + getI64SubArray(op.getStrides(), /*dropFront=*/rankDiff)); } rewriter.replaceOpWithNewOp(op, op.getType(), source); return success(); diff --git a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp index 125c3d918284c..2d5cc070558c3 100644 --- a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp +++ b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp @@ -13,14 +13,12 @@ #include "mlir/Conversion/VectorToSCF/VectorToSCF.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/Dialect/X86Vector/Transforms.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" using namespace mlir; using namespace mlir::vector; diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp index c6627b5ec0d77..0cd3f786d8101 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp @@ -11,26 +11,17 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" -#include 
"mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Location.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/VectorInterfaces.h" #define DEBUG_TYPE "vector-contract-lowering" diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp index 3000204c8ce17..f4ad56b4178db 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorGather.cpp @@ -11,26 +11,19 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" -#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Location.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/VectorInterfaces.h" #define DEBUG_TYPE "vector-broadcast-lowering" diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorScan.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorScan.cpp index a1f67bd0e9ed3..6f3955f522775 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorScan.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorScan.cpp @@ -11,26 +11,16 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" -#include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" -#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Location.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/VectorInterfaces.h" #define DEBUG_TYPE "vector-broadcast-lowering" diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp index 5b81d0d33d484..fb040bc51a993 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp @@ -11,11 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" 
-#include "mlir/Interfaces/VectorInterfaces.h" using namespace mlir; using namespace mlir::vector; diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp index 732e316c93381..1fac967e4a2f9 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/UB/IR/UBOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index c8566b1ff83ef..436029c31e7f8 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -1704,19 +1704,18 @@ struct WarpOpScfForOp : public WarpDistributionPattern { : WarpDistributionPattern(ctx, b), distributionMapFn(std::move(fn)) {} LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { - auto yield = cast( + auto warpOpYield = cast( warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); - // Only pick up forOp if it is the last op in the region. - Operation *lastNode = yield->getPrevNode(); + // Only pick up `ForOp` if it is the last op in the region. + Operation *lastNode = warpOpYield->getPrevNode(); auto forOp = dyn_cast_or_null(lastNode); if (!forOp) return failure(); - // Collect Values that come from the warp op but are outside the forOp. - // Those Value needs to be returned by the original warpOp and passed to - // the new op. + // Collect Values that come from the `WarpOp` but are outside the `ForOp`. + // Those Values need to be returned by the new warp op. llvm::SmallSetVector escapingValues; - SmallVector inputTypes; - SmallVector distTypes; + SmallVector escapingValueInputTypes; + SmallVector escapingValueDistTypes; mlir::visitUsedValuesDefinedAbove( forOp.getBodyRegion(), [&](OpOperand *operand) { Operation *parent = operand->get().getParentRegion()->getParentOp(); @@ -1728,81 +1727,168 @@ struct WarpOpScfForOp : public WarpDistributionPattern { AffineMap map = distributionMapFn(operand->get()); distType = getDistributedType(vecType, map, warpOp.getWarpSize()); } - inputTypes.push_back(operand->get().getType()); - distTypes.push_back(distType); + escapingValueInputTypes.push_back(operand->get().getType()); + escapingValueDistTypes.push_back(distType); } }); - if (llvm::is_contained(distTypes, Type{})) + if (llvm::is_contained(escapingValueDistTypes, Type{})) return failure(); - - SmallVector newRetIndices; - WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, escapingValues.getArrayRef(), distTypes, - newRetIndices); - yield = cast( - newWarpOp.getBodyRegion().getBlocks().begin()->getTerminator()); - - SmallVector newOperands; - SmallVector resultIdx; - // Collect all the outputs coming from the forOp. - for (OpOperand &yieldOperand : yield->getOpOperands()) { - if (yieldOperand.get().getDefiningOp() != forOp.getOperation()) + // `WarpOp` can yield two types of values: + // 1. Values that are not results of the `ForOp`: + // These values must also be yielded by the new `WarpOp`. Also, we need + // to record the index mapping for these values to replace them later. + // 2. 
Values that are results of the `ForOp`: + // In this case, we record the index mapping between the `WarpOp` result + // index and matching `ForOp` result index. + // Additionally, we keep track of the distributed types for all `ForOp` + // vector results. + SmallVector<Value> nonForYieldedValues; + SmallVector<unsigned> nonForResultIndices; + llvm::SmallDenseMap<unsigned, unsigned> forResultMapping; + llvm::SmallDenseMap<unsigned, VectorType> forResultDistTypes; + for (OpOperand &yieldOperand : warpOpYield->getOpOperands()) { + // Yielded value is not a result of the forOp. + if (yieldOperand.get().getDefiningOp() != forOp.getOperation()) { + nonForYieldedValues.push_back(yieldOperand.get()); + nonForResultIndices.push_back(yieldOperand.getOperandNumber()); continue; - auto forResult = cast<OpResult>(yieldOperand.get()); - newOperands.push_back( - newWarpOp.getResult(yieldOperand.getOperandNumber())); - yieldOperand.set(forOp.getInitArgs()[forResult.getResultNumber()]); - resultIdx.push_back(yieldOperand.getOperandNumber()); + } + OpResult forResult = cast<OpResult>(yieldOperand.get()); + unsigned int forResultNumber = forResult.getResultNumber(); + forResultMapping[yieldOperand.getOperandNumber()] = forResultNumber; + // If this `ForOp` result is a vector type and it is yielded by the + // `WarpOp`, we keep track of the distributed type for this result. + if (!isa<VectorType>(forResult.getType())) + continue; + VectorType distType = cast<VectorType>( + warpOp.getResult(yieldOperand.getOperandNumber()).getType()); + forResultDistTypes[forResultNumber] = distType; } + // The newly created `WarpOp` will yield values in the following order: + // 1. All init args of the `ForOp`. + // 2. All escaping values. + // 3. All non-`ForOp` yielded values. + SmallVector<Value> newWarpOpYieldValues; + SmallVector<Type> newWarpOpDistTypes; + for (auto [i, initArg] : llvm::enumerate(forOp.getInitArgs())) { + newWarpOpYieldValues.push_back(initArg); + // Compute the distributed type for this init arg. + Type distType = initArg.getType(); + if (auto vecType = dyn_cast<VectorType>(distType)) { + // If the `ForOp` result corresponding to this init arg is already + // yielded, we can get the distributed type from the + // `forResultDistTypes` map. + // Otherwise, we compute it using distributionMapFn. + AffineMap map = distributionMapFn(initArg); + distType = forResultDistTypes.count(i) + ? forResultDistTypes[i] + : getDistributedType(vecType, map, warpOp.getWarpSize()); + } + newWarpOpDistTypes.push_back(distType); + } + // Insert escaping values and their distributed types. + newWarpOpYieldValues.insert(newWarpOpYieldValues.end(), + escapingValues.begin(), escapingValues.end()); + newWarpOpDistTypes.insert(newWarpOpDistTypes.end(), + escapingValueDistTypes.begin(), + escapingValueDistTypes.end()); + // Next, we insert all non-`ForOp` yielded values and their distributed + // types. We also create a mapping between the non-`ForOp` yielded value + // index and the corresponding new `WarpOp` yield value index (needed to + // update users later). + llvm::SmallDenseMap<unsigned, unsigned> nonForResultMapping; + for (auto [i, v] : + llvm::zip_equal(nonForResultIndices, nonForYieldedValues)) { + nonForResultMapping[i] = newWarpOpYieldValues.size(); + newWarpOpYieldValues.push_back(v); + newWarpOpDistTypes.push_back(warpOp.getResult(i).getType()); + } + // Create the new `WarpOp` with the updated yield values and types. + WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns( + rewriter, warpOp, newWarpOpYieldValues, newWarpOpDistTypes); + + // Next, we create a new `ForOp` with the init args yielded by the new + // `WarpOp`.
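As a concrete illustration of the yield layout built above (sizes hypothetical, added for exposition): a loop with init args %a and %b, one escaping value %e, and one non-ForOp yielded value %v gives

    //   new WarpOp yields: [%a, %b, %e, %v]
    //   escapingValuesStartIdx == 2 (the number of ForOp init args)
    //   nonForResultMapping: %v's original result index -> position 3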
+ const unsigned escapingValuesStartIdx = + forOp.getInitArgs().size(); // `ForOp` init args are positioned before + // escaping values in the new `WarpOp`. + SmallVector newForOpOperands; + for (size_t i = 0; i < escapingValuesStartIdx; ++i) + newForOpOperands.push_back(newWarpOp.getResult(i)); + + // Create a new `ForOp` outside the new `WarpOp` region. OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPointAfter(newWarpOp); - - // Create a new for op outside the region with a WarpExecuteOnLane0Op - // region inside. auto newForOp = rewriter.create( forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), - forOp.getStep(), newOperands); + forOp.getStep(), newForOpOperands); + // Next, we insert a new `WarpOp` (called inner `WarpOp`) inside the + // newly created `ForOp`. This `WarpOp` will contain all ops that were + // contained within the original `ForOp` body. rewriter.setInsertionPointToStart(newForOp.getBody()); - SmallVector warpInput(newForOp.getRegionIterArgs().begin(), - newForOp.getRegionIterArgs().end()); - SmallVector warpInputType(forOp.getResultTypes().begin(), - forOp.getResultTypes().end()); + SmallVector innerWarpInput(newForOp.getRegionIterArgs().begin(), + newForOp.getRegionIterArgs().end()); + SmallVector innerWarpInputType(forOp.getResultTypes().begin(), + forOp.getResultTypes().end()); + // Escaping values are forwarded to the inner `WarpOp` as its (additional) + // arguments. We keep track of the mapping between these values and their + // argument index in the inner `WarpOp` (to replace users later). llvm::SmallDenseMap argIndexMapping; - for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) { - warpInput.push_back(newWarpOp.getResult(retIdx)); - argIndexMapping[escapingValues[i]] = warpInputType.size(); - warpInputType.push_back(inputTypes[i]); + for (size_t i = escapingValuesStartIdx; + i < escapingValuesStartIdx + escapingValues.size(); ++i) { + innerWarpInput.push_back(newWarpOp.getResult(i)); + argIndexMapping[escapingValues[i - escapingValuesStartIdx]] = + innerWarpInputType.size(); + innerWarpInputType.push_back( + escapingValueInputTypes[i - escapingValuesStartIdx]); } + // Create the inner `WarpOp` with the new input values and types. auto innerWarp = rewriter.create( newWarpOp.getLoc(), newForOp.getResultTypes(), newWarpOp.getLaneid(), - newWarpOp.getWarpSize(), warpInput, warpInputType); + newWarpOp.getWarpSize(), innerWarpInput, innerWarpInputType); + // Inline the `ForOp` body into the inner `WarpOp` body. SmallVector argMapping; argMapping.push_back(newForOp.getInductionVar()); - for (Value args : innerWarp.getBody()->getArguments()) { + for (Value args : innerWarp.getBody()->getArguments()) argMapping.push_back(args); - } + argMapping.resize(forOp.getBody()->getNumArguments()); SmallVector yieldOperands; for (Value operand : forOp.getBody()->getTerminator()->getOperands()) yieldOperands.push_back(operand); + rewriter.eraseOp(forOp.getBody()->getTerminator()); rewriter.mergeBlocks(forOp.getBody(), innerWarp.getBody(), argMapping); + + // Insert a gpu `YieldOp` at the end of the inner `WarpOp` body that yields + // original `ForOp` results. rewriter.setInsertionPointToEnd(innerWarp.getBody()); rewriter.create(innerWarp.getLoc(), yieldOperands); rewriter.setInsertionPointAfter(innerWarp); + // Insert a scf.yield op at the end of the new `ForOp` body that yields + // the inner `WarpOp` results. 
if (!innerWarp.getResults().empty()) rewriter.create<scf::YieldOp>(forOp.getLoc(), innerWarp.getResults()); + + // Update the users of the original `WarpOp` results that were coming from + // the original `ForOp` to the corresponding new `ForOp` result. + for (auto [origIdx, newIdx] : forResultMapping) + rewriter.replaceAllUsesExcept(warpOp.getResult(origIdx), + newForOp.getResult(newIdx), newForOp); + // Similarly, update any users of the `WarpOp` results that were not + // results of the `ForOp`. + for (auto [origIdx, newIdx] : nonForResultMapping) + rewriter.replaceAllUsesWith(warpOp.getResult(origIdx), + newWarpOp.getResult(newIdx)); + // Remove the original `WarpOp` and `ForOp`; they should not have any uses + // at this point. rewriter.eraseOp(forOp); - // Replace the warpOp result coming from the original ForOp. - for (const auto &res : llvm::enumerate(resultIdx)) { - rewriter.replaceAllUsesWith(newWarpOp.getResult(res.value()), - newForOp.getResult(res.index())); - newForOp->setOperand(res.index() + 3, newWarpOp.getResult(res.value())); - } + rewriter.eraseOp(warpOp); + // Update any users of escaping values that were forwarded to the + // inner `WarpOp`. These values are now arguments of the inner `WarpOp`. newForOp.walk([&](Operation *op) { for (OpOperand &operand : op->getOpOperands()) { auto it = argIndexMapping.find(operand.get()); @@ -1812,7 +1898,7 @@ struct WarpOpScfForOp : public WarpDistributionPattern { } }); - // Finally, hoist out any now uniform code from the inner warp op. + // Finally, hoist out any now uniform code from the inner `WarpOp`. mlir::vector::moveScalarUniformCode(innerWarp); return success(); } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp index 363108238e596..a7403250a069b 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp @@ -6,10 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h" -#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/Interfaces/FunctionInterfaces.h" diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp index 256c8cb69b1ba..eee090d495c17 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include -#include #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -22,16 +21,10 @@ #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Interfaces/VectorInterfaces.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "vector-transfer-split" diff --git a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp
b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp index 179e6ee8784e6..7de32f7cbfb8b 100644 --- a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp +++ b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp @@ -12,11 +12,8 @@ #include "mlir/Dialect/X86Vector/X86VectorDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/IR/Builders.h" -#include "mlir/IR/OpImplementation.h" #include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" using namespace mlir; diff --git a/mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp b/mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp index 63f725084cabf..87f7867fe1b7c 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp @@ -17,7 +17,6 @@ #include "mlir/Dialect/Vector/Utils/VectorUtils.h" #include "mlir/Dialect/X86Vector/Transforms.h" #include "mlir/IR/ImplicitLocOpBuilder.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormatVariadic.h" diff --git a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp index 8e062488f58c8..ce64a54871472 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp @@ -10,9 +10,7 @@ #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/X86Vector/X86VectorDialect.h" -#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/PatternMatch.h" using namespace mlir; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 6b323e35ac622..642c393cbc2c8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -12,7 +12,6 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" #include "llvm/ADT/TypeSwitch.h" -#include using std::optional; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index ef7cd1424e7a4..78cbf884a1911 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/IR/Builders.h" #include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/Support/Debug.h" @@ -112,6 +113,68 @@ isValidGatherScatterParams(Type maskTy, VectorType valueTy, //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, TypedValue source) { + [[maybe_unused]] auto ty = source.getType(); + assert(ty.hasStaticShape() && "expecting a memref with static shape"); + + build(builder, state, tdesc, source, ValueRange({}) /* dynamic offsets */, + ValueRange({}) /* empty dynamic shape */, + ValueRange({}) /* empty dynamic strides */, + DenseI64ArrayAttr({}) /* const offsets */, + DenseI64ArrayAttr({}) /* empty const shape*/, + DenseI64ArrayAttr({}) /* empty const strides*/); +} + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, TypedValue source, + llvm::ArrayRef shape, + llvm::ArrayRef 
strides) { + assert(shape.size() && strides.size() && shape.size() == strides.size() && + "Shape and strides must be present and of equal size for ui64 " + "initialization."); + + llvm::SmallVector staticShape; + llvm::SmallVector staticStrides; + llvm::SmallVector dynamicShape; + llvm::SmallVector dynamicStrides; + + dispatchIndexOpFoldResults(shape, dynamicShape, staticShape); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); + + auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape); + auto staticStridesAttr = builder.getDenseI64ArrayAttr(staticStrides); + + build(builder, state, tdesc, source, ValueRange({}), dynamicShape, + dynamicStrides, builder.getDenseI64ArrayAttr({}), staticShapeAttr, + staticStridesAttr); +} + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, TypedValue source, + llvm::ArrayRef shape, + llvm::ArrayRef strides) { + assert(shape.size() && strides.size() && shape.size() == strides.size() && + "Shape and strides must be present and of equal size for ui64 " + "initialization."); + + llvm::SmallVector staticShape; + llvm::SmallVector staticStrides; + llvm::SmallVector dynamicShape; + llvm::SmallVector dynamicStrides; + + dispatchIndexOpFoldResults(shape, dynamicShape, staticShape); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); + + auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape); + auto staticStridesAttr = builder.getDenseI64ArrayAttr(staticStrides); + + build(builder, state, tdesc, source, ValueRange({}), dynamicShape, + dynamicStrides, builder.getDenseI64ArrayAttr({}), staticShapeAttr, + staticStridesAttr); +} + void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type tdesc, TypedValue source, llvm::ArrayRef offsets) { @@ -125,8 +188,8 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, ValueRange({}) /* empty dynamic shape */, ValueRange({}) /* empty dynamic strides */, - staticOffsets /* const offsets */, {} /* empty const shape*/, - {} /* empty const strides*/); + builder.getDenseI64ArrayAttr(staticOffsets) /* const offsets */, + {} /* empty const shape*/, {} /* empty const strides*/); } void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, @@ -197,6 +260,13 @@ LogicalResult CreateNdDescOp::verify() { invalidElemTy |= memrefTy.getElementType() != getElementType(); } + if (llvm::isa(getSourceType())) { + // Strides and shape must be present for an integer source. + if (getMixedStrides().empty() || getMixedSizes().empty()) + return emitOpError("Expecting strides and shape to be present for " + "integer source."); + } + // Mismatches among shape, strides, and offsets are // already handled by OffsetSizeAndStrideOpInterface. // So they are not checked here.
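The builders above all follow the same recipe: dispatchIndexOpFoldResults splits a mixed list of OpFoldResults into the attribute and operand forms the op stores. A minimal sketch of that split, not part of the patch itself (`dynValue` is a hypothetical index-typed Value):

    // Each OpFoldResult holds either a Value or an integer attribute.
    SmallVector<OpFoldResult> mixed = {dynValue, builder.getI64IntegerAttr(16)};
    SmallVector<Value> dynamicShape;  // receives {dynValue}
    SmallVector<int64_t> staticShape; // receives {ShapedType::kDynamic, 16}
    dispatchIndexOpFoldResults(mixed, dynamicShape, staticShape);

The static values then travel as a DenseI64ArrayAttr while only the truly dynamic values become operands, which matches the convention OffsetSizeAndStrideOpInterface expects.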
@@ -221,6 +291,53 @@ LogicalResult CreateNdDescOp::verify() { return success(); } +ParseResult parseOptionalDynamicIndexList( + OpAsmParser &parser, + SmallVectorImpl &values, + DenseI64ArrayAttr &integers, SmallVectorImpl *valueTypes = nullptr, + AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { + + SmallVector integerVals; + auto parseIntegerOrValue = [&]() { + OpAsmParser::UnresolvedOperand operand; + auto res = parser.parseOptionalOperand(operand); + + if (res.has_value() && succeeded(res.value())) { + values.push_back(operand); + integerVals.push_back(ShapedType::kDynamic); + if (valueTypes && parser.parseColonType(valueTypes->emplace_back())) + return failure(); + } else { + int64_t integer; + if (failed(parser.parseInteger(integer))) + return failure(); + integerVals.push_back(integer); + } + return success(); + }; + + // If the optional values are given, there must be a left bracket. + if (parser.parseOptionalLSquare().succeeded()) { + if (parser.parseCommaSeparatedList(parseIntegerOrValue) || + parser.parseRSquare()) + return parser.emitError(parser.getNameLoc()) + << "expected a list of SSA values or integers"; + integers = parser.getBuilder().getDenseI64ArrayAttr(integerVals); + return success(); + } + + return success(); +} + +void printOptionalDynamicIndexList( + OpAsmPrinter &printer, Operation *op, OperandRange values, + ArrayRef integers, TypeRange valueTypes = TypeRange(), + AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { + + return printDynamicIndexList(printer, op, values, integers, + /*scalableFlags=*/{}, valueTypes, delimiter); +} + //===----------------------------------------------------------------------===// // XeGPU_PrefetchNdOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index ddc9e0eb908ac..0c55588eda7a3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -8,13 +8,11 @@ #include "mlir/Dialect/XeGPU/Transforms/Passes.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 9558c08c079fc..bef88042fc663 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h" #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" #include "mlir/Analysis/DataFlow/SparseAnalysis.h" #include "mlir/Analysis/DataFlow/Utils.h"
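A note on the parseOptionalDynamicIndexList/printOptionalDynamicIndexList pair added to XeGPUOps.cpp above: it round-trips an optional bracketed list that may mix SSA values and integer literals. A sketch of the intended round trip (assumed syntax, derived from the parser code rather than taken verbatim from the patch):

    // Textual form:          [%off0, 16, %off1]
    // values   (operands):   {%off0, %off1}
    // integers (attribute):  DenseI64ArrayAttr{kDynamic, 16, kDynamic}
    //
    // When no bracketed list is present, the parser returns success without
    // setting `integers`, which is what makes the whole group optional.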
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index c072557c2bd22..bc61979c2732b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -135,19 +135,6 @@ static Value resolveDistributedTy(Value orig, T expected, return orig; } -/// Helper function to filter out the temporary layout attributes attached -/// during the layout assignment process. These are not needed after going to -/// SIMT. -static SmallVector -removeTemporaryLayoutAttributes(ArrayRef attrs) { - SmallVector newAttrs; - for (NamedAttribute attr : attrs) { - if (!isa(attr.getValue())) - newAttrs.push_back(attr); - } - return newAttrs; -} - /// Helper function to check if the layout is packed. Layout is packed if it is /// 2D and lane_data[0] != 1 (data packed from col dimension). static bool hasPackedLayout(xegpu::LayoutAttr layout) { @@ -197,9 +184,17 @@ struct MoveFuncBodyToWarpExecuteOnLane0 return isa(op); })) return failure(); - // Create a new function with the same signature. + // Create a new function with the same signature and same attributes. + SmallVector workgroupAttributionsTypes = + llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(), + [](BlockArgument arg) { return arg.getType(); }); + SmallVector privateAttributionsTypes = + llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(), + [](BlockArgument arg) { return arg.getType(); }); auto newGpuFunc = rewriter.create( - gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType()); + gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType(), + workgroupAttributionsTypes, privateAttributionsTypes); + newGpuFunc->setAttrs(gpuFuncOp->getAttrs()); // Create a WarpExecuteOnLane0Op with the same arguments and results as the // original gpuFuncOp. rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front()); @@ -265,13 +260,13 @@ struct MoveFuncBodyToWarpExecuteOnLane0 /// ``` struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { OpOperand *operand = - getWarpResult(subgroupOp, llvm::IsaPred); + getWarpResult(warpOp, llvm::IsaPred); if (!operand) return rewriter.notifyMatchFailure( - subgroupOp, "warp result is not a xegpu::CreateNdDesc op"); + warpOp, "warp result is not a xegpu::CreateNdDesc op"); auto descOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); @@ -288,9 +283,9 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { newYieldValues.push_back(operand); newYieldTypes.push_back(operand.getType()); } - rewriter.setInsertionPoint(subgroupOp); + rewriter.setInsertionPoint(warpOp); gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, /* new yielded values = */ newYieldValues, + rewriter, warpOp, /* new yielded values = */ newYieldValues, /* new yielded types = */ newYieldTypes, newRetIndices); SmallVector newDescOperands; @@ -347,10 +342,10 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { /// ``` struct StoreNdDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { auto yield = cast( - subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator()); + warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); Operation *lastNode =
yield->getPrevNode(); auto storeOp = dyn_cast_or_null(lastNode); if (!storeOp) @@ -372,7 +367,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, + rewriter, warpOp, /* new yielded values = */ ValueRange{storeOp.getValue(), storeOp.getTensorDesc()}, /* new yielded types = */ @@ -403,9 +398,9 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]), distributedTensorDescTy, rewriter)); - rewriter.create( - newWarpOp.getLoc(), TypeRange{}, newStoreOperands, - removeTemporaryLayoutAttributes(storeOp->getAttrs())); + auto newStoreOp = rewriter.create( + newWarpOp.getLoc(), TypeRange{}, newStoreOperands, storeOp->getAttrs()); + xegpu::removeLayoutAttrs(newStoreOp); rewriter.eraseOp(storeOp); return success(); } @@ -449,21 +444,22 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { /// ``` struct LoadNdDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { - OpOperand *operand = - getWarpResult(subgroupOp, llvm::IsaPred); + OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) { + if (!isa(op)) + return false; + // Make sure the same load op is the last operation in the warp op body. + // This ensures that the load op is not sunk earlier, violating any + // barrier synchronizations. + auto yield = cast( + warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); + return yield->getPrevNode() == op; + }); + if (!operand) return rewriter.notifyMatchFailure( - subgroupOp, "warp result is not a xegpu::LoadNd op"); - // Make sure the load op is the last operation in the warp op body. This - // ensure that load op is not sinked earlier violating any barrier - // synchronizations. - auto yield = cast( - subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator()); - Operation *lastNode = yield->getPrevNode(); - if (!dyn_cast_or_null(lastNode)) - return failure(); + warpOp, "warp result is not a xegpu::LoadNd op"); auto loadOp = operand->get().getDefiningOp(); xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType(); @@ -474,11 +470,11 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { unsigned operandIdx = operand->getOperandNumber(); VectorType distributedTypeByWarpOp = - cast(subgroupOp.getResult(operandIdx).getType()); + cast(warpOp.getResult(operandIdx).getType()); SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, + rewriter, warpOp, /* new yielded values = */ loadOp.getTensorDesc(), /* new yielded types = */ tensorDescTy, newRetIndices); @@ -498,7 +494,8 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(), resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]), distributedTensorDescTy, rewriter), - removeTemporaryLayoutAttributes(loadOp->getAttrs())); + loadOp->getAttrs()); + xegpu::removeLayoutAttrs(newLoadOp); // Set the packed attribute if the layout requires it.
newLoadOp.setPacked(hasPackedLayout(layout)); Value distributedVal = newWarpOp.getResult(operandIdx); @@ -548,12 +545,11 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { /// ``` struct DpasDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { - OpOperand *operand = - getWarpResult(subgroupOp, llvm::IsaPred); + OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred); if (!operand) - return rewriter.notifyMatchFailure(subgroupOp, + return rewriter.notifyMatchFailure(warpOp, "warp result is not a xegpu::Dpas op"); auto dpasOp = operand->get().getDefiningOp(); @@ -599,7 +595,7 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { // Create a new warp op without the dpas. SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices); + rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); FailureOr expectedDistLhsTyOrFailure = xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA); @@ -630,14 +626,16 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]), newDpasOperandExpectedTypes[i], rewriter)); } - Value newDpasOp = rewriter.create( - newWarpOp->getLoc(), distributedResultTy, newDpasOperands, - removeTemporaryLayoutAttributes(dpasOp->getAttrs())); + auto newDpasOp = + rewriter.create(newWarpOp->getLoc(), distributedResultTy, + newDpasOperands, dpasOp->getAttrs()); + xegpu::removeLayoutAttrs(newDpasOp); Value distributedVal = newWarpOp.getResult(operandIdx); // Resolve the output type. - newDpasOp = resolveDistributedTy( - newDpasOp, distResultTypeByWarpOpOrFailure.value(), rewriter); - rewriter.replaceAllUsesWith(distributedVal, newDpasOp); + Value typeResolved = + resolveDistributedTy(newDpasOp.getResult(), + distResultTypeByWarpOpOrFailure.value(), rewriter); + rewriter.replaceAllUsesWith(distributedVal, typeResolved); return success(); } }; @@ -678,13 +676,13 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { /// ``` struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { OpOperand *operand = - getWarpResult(subgroupOp, llvm::IsaPred); + getWarpResult(warpOp, llvm::IsaPred); if (!operand) return rewriter.notifyMatchFailure( - subgroupOp, "warp result is not a xegpu::UpdateNdOffset op"); + warpOp, "warp result is not a xegpu::UpdateNdOffset op"); auto updateOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); // new update op does not have layout attribute. 
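A recurring change in the surrounding patterns: instead of filtering layout attributes while creating the replacement op (the removed removeTemporaryLayoutAttributes helper), the op is now created with all of the original attributes and the layout attributes are stripped afterwards. A minimal sketch of the new idiom (SomeXeGPUOp is a placeholder; removeLayoutAttrs is the helper added to XeGPUUtils.cpp later in this patch):

    auto newOp = rewriter.create<SomeXeGPUOp>(loc, resultTypes, operands,
                                              oldOp->getAttrs());
    xegpu::removeLayoutAttrs(newOp); // drops the per-operand and per-result
                                     // layout attributes in place

This keeps every other attribute intact without rebuilding the attribute list by hand.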
@@ -703,7 +701,7 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { } SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices); + rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); rewriter.setInsertionPointAfter(newWarpOp); SmallVector newUpdateOperands; for (size_t i : newRetIndices) { @@ -717,14 +715,15 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { } } // Create a new update op outside the warp op. - Value newUpdateOp = rewriter.create( + auto newUpdateOp = rewriter.create( newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands, - removeTemporaryLayoutAttributes(updateOp->getAttrs())); + updateOp->getAttrs()); + xegpu::removeLayoutAttrs(newUpdateOp); Value distributedVal = newWarpOp.getResult(operandIdx); // Resolve the distributed type with the original type. - newUpdateOp = - resolveDistributedTy(newUpdateOp, distributedVal.getType(), rewriter); - rewriter.replaceAllUsesWith(distributedVal, newUpdateOp); + Value typeResolved = resolveDistributedTy( + newUpdateOp.getResult(), distributedVal.getType(), rewriter); + rewriter.replaceAllUsesWith(distributedVal, typeResolved); return success(); } }; @@ -758,10 +757,10 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { /// ``` struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { auto yield = cast( - subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator()); + warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); Operation *lastNode = yield->getPrevNode(); auto prefetchOp = dyn_cast_or_null(lastNode); if (!prefetchOp) @@ -775,7 +774,7 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { SmallVector newYieldTypes = {prefetchOp.getTensorDescType()}; SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices); + rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); // Create a new prefetch op outside the warp op with updated tensor // descriptor type. Source tensor descriptor requires type resolution. xegpu::TensorDescType newTensorDescTy = @@ -783,9 +782,10 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { rewriter.setInsertionPointAfter(newWarpOp); SmallVector newPrefetchOperands = {resolveDistributedTy( newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)}; - rewriter.create( - newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands, - removeTemporaryLayoutAttributes(prefetchOp->getAttrs())); + auto newPrefetchOp = rewriter.create(newWarpOp.getLoc(), TypeRange{}, + newPrefetchOperands, + prefetchOp->getAttrs()); + xegpu::removeLayoutAttrs(newPrefetchOp); rewriter.eraseOp(prefetchOp); return success(); } @@ -795,17 +795,17 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { /// region. This will simply move the barrier op outside of the warp op.
struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { auto yield = cast( - subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator()); + warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); Operation *lastNode = yield->getPrevNode(); // The last node must be a gpu::BarrierOp. auto barrierOp = dyn_cast_or_null(lastNode); if (!barrierOp) return failure(); // Move the barrier op outside of the warp op. - rewriter.setInsertionPointAfter(subgroupOp); + rewriter.setInsertionPointAfter(warpOp); rewriter.create( barrierOp.getLoc(), barrierOp->getResultTypes(), barrierOp->getOperands(), barrierOp->getAttrs()); @@ -876,15 +876,32 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // Step 3: Apply subgroup to workitem distribution patterns. RewritePatternSet patterns(&getContext()); xegpu::populateXeGPUSubgroupDistributePatterns(patterns); - // TODO: distributionFn and shuffleFn are not used at this point. + // distributionFn is used by vector distribution patterns to determine the + // distributed vector type for a given vector value. In the XeGPU subgroup + // distribution context, we compute this based on lane layout. auto distributionFn = [](Value val) { VectorType vecType = dyn_cast(val.getType()); int64_t vecRank = vecType ? vecType.getRank() : 0; - OpBuilder builder(val.getContext()); if (vecRank == 0) return AffineMap::get(val.getContext()); - return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); + // Get the layout of the vector type. + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(val); + // If no layout is specified, assume the innermost dimension is distributed + // for now. + if (!layout) + return AffineMap::getMultiDimMapWithTargets( + vecRank, {static_cast(vecRank - 1)}, val.getContext()); + SmallVector distributedDims; + // Get the distributed dimensions based on the layout. + ArrayRef laneLayout = layout.getLaneLayout().asArrayRef(); + for (unsigned i = 0; i < laneLayout.size(); ++i) { + if (laneLayout[i] > 1) + distributedDims.push_back(i); + } + return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims, + val.getContext()); };
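A worked example for the distributionFn above (illustrative values, not from the patch): a value of type vector<16x16xf16> carrying lane_layout = [1, 16] is distributed only along dimension 1, since that is the only dimension with more than one lane:

    // lane_layout = [1, 16], vecRank = 2
    // laneLayout[0] == 1  -> dimension 0 is not distributed
    // laneLayout[1] == 16 -> distributedDims = {1}
    // returned map: (d0, d1) -> (d1)

When no layout attribute is present, the fallback map likewise targets the innermost dimension.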
+ // TODO: shuffleFn is not used. auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx, int64_t warpSz) { return Value(); }; vector::populatePropagateWarpVectorDistributionPatterns( diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 13d49cb0e9d82..dc76441b27c02 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -12,16 +12,12 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/XeGPU/Transforms/Passes.h" - #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" -#include namespace mlir { namespace xegpu { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 6b85a66a8bd36..370d149ee55af 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -184,6 +184,31 @@ void xegpu::setLayoutAttrs(Operation *op, }); } +template +void xegpu::removeLayoutAttr(const T &operandOrResult) { + Operation *owner = operandOrResult.getOwner(); + std::string name = xegpu::getLayoutName(operandOrResult); + if (owner->hasAttrOfType(name)) + owner->removeAttr(name); +} + +// Explicit instantiation for OpResult +template void +xegpu::removeLayoutAttr(const mlir::OpResult &result); + +// Explicit instantiation for OpOperand +template void +xegpu::removeLayoutAttr(const mlir::OpOperand &operand); + +void xegpu::removeLayoutAttrs(Operation *op) { + op->walk([&](Operation *nestOp) { + for (OpOperand &opr : nestOp->getOpOperands()) + removeLayoutAttr(opr); + for (OpResult result : nestOp->getOpResults()) + removeLayoutAttr(result); + }); +} + SmallVector xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef shape) { diff --git a/mlir/lib/IR/AttributeDetail.h b/mlir/lib/IR/AttributeDetail.h index 26d40ac3a38f6..cb9d21bf3e611 100644 --- a/mlir/lib/IR/AttributeDetail.h +++ b/mlir/lib/IR/AttributeDetail.h @@ -19,11 +19,9 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/IntegerSet.h" #include "mlir/IR/MLIRContext.h" -#include "mlir/Support/StorageUniquer.h" -#include "mlir/Support/ThreadLocalCache.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/PointerIntPair.h" -#include "llvm/Support/TrailingObjects.h" +#include "llvm/Support/Allocator.h" +#include namespace mlir { namespace detail { @@ -396,27 +394,30 @@ class DistinctAttributeUniquer { Attribute referencedAttr); }; -/// An allocator for distinct attribute storage instances. It uses thread local -/// bump pointer allocators stored in a thread local cache to ensure the storage -/// is freed after the destruction of the distinct attribute allocator. -class DistinctAttributeAllocator { +/// An allocator for distinct attribute storage instances. Uses a synchronized +/// BumpPtrAllocator to ensure thread safety. The allocated storage is deleted +/// when the DistinctAttributeAllocator is destroyed.
+class DistinctAttributeAllocator final { public: DistinctAttributeAllocator() = default; - DistinctAttributeAllocator(DistinctAttributeAllocator &&) = delete; DistinctAttributeAllocator(const DistinctAttributeAllocator &) = delete; DistinctAttributeAllocator & operator=(const DistinctAttributeAllocator &) = delete; - /// Allocates a distinct attribute storage using a thread local bump pointer - /// allocator to enable synchronization free parallel allocations. DistinctAttrStorage *allocate(Attribute referencedAttr) { - return new (allocatorCache.get().Allocate()) + std::scoped_lock guard(allocatorMutex); + return new (allocator.Allocate()) DistinctAttrStorage(referencedAttr); - } + } private: - ThreadLocalCache allocatorCache; + /// Used to allocate distinct attribute storages. The managed memory is freed + /// automatically when the allocator instance is destroyed. + llvm::BumpPtrAllocator allocator; + + /// Used to lock access to the allocator. + std::mutex allocatorMutex; }; } // namespace detail } // namespace mlir diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt index 4cabac185171c..3ef69cea18f0a 100644 --- a/mlir/lib/IR/CMakeLists.txt +++ b/mlir/lib/IR/CMakeLists.txt @@ -29,6 +29,7 @@ add_mlir_library(MLIRIR ODSSupport.cpp Operation.cpp OperationSupport.cpp + PatternLoggingListener.cpp PatternMatch.cpp Region.cpp RegionKindInterface.cpp diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp index 716d9c85a377d..06ec1c85fb4d5 100644 --- a/mlir/lib/IR/MLIRContext.cpp +++ b/mlir/lib/IR/MLIRContext.cpp @@ -31,6 +31,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/RWMutex.h" #include "llvm/Support/ThreadPool.h" diff --git a/mlir/lib/IR/PatternLoggingListener.cpp b/mlir/lib/IR/PatternLoggingListener.cpp new file mode 100644 index 0000000000000..ce2123ae1a19a --- /dev/null +++ b/mlir/lib/IR/PatternLoggingListener.cpp @@ -0,0 +1,50 @@ +#include "mlir/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "pattern-logging-listener" +#define DBGS() (llvm::dbgs() << "[" << DEBUG_TYPE << "] ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +using namespace mlir; + +void RewriterBase::PatternLoggingListener::notifyOperationInserted( + Operation *op, InsertPoint previous) { + LDBG(patternName << " | notifyOperationInserted" + << " | " << op->getName()); + ForwardingListener::notifyOperationInserted(op, previous); +} + +void RewriterBase::PatternLoggingListener::notifyOperationModified( + Operation *op) { + LDBG(patternName << " | notifyOperationModified" + << " | " << op->getName()); + ForwardingListener::notifyOperationModified(op); +} + +void RewriterBase::PatternLoggingListener::notifyOperationReplaced( + Operation *op, Operation *newOp) { + LDBG(patternName << " | notifyOperationReplaced (with op)" + << " | " << op->getName() << " | " << newOp->getName()); + ForwardingListener::notifyOperationReplaced(op, newOp); +} + +void RewriterBase::PatternLoggingListener::notifyOperationReplaced( + Operation *op, ValueRange replacement) { + LDBG(patternName << " | notifyOperationReplaced (with values)" + << " | " << op->getName()); + ForwardingListener::notifyOperationReplaced(op, replacement); +} + +void RewriterBase::PatternLoggingListener::notifyOperationErased( + Operation *op) { + LDBG(patternName << " | notifyOperationErased" + << " | " << op->getName()); +
ForwardingListener::notifyOperationErased(op); +} + +void RewriterBase::PatternLoggingListener::notifyPatternBegin( + const Pattern &pattern, Operation *op) { + LDBG(patternName << " | notifyPatternBegin" + << " | " << op->getName()); + ForwardingListener::notifyPatternBegin(pattern, op); +} diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp index e87bb461b0329..ca3f7666dba8a 100644 --- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp +++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp @@ -9,9 +9,7 @@ #include #include "mlir/IR/BuiltinTypes.h" -#include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" -#include "llvm/ADT/SmallPtrSet.h" using namespace mlir; diff --git a/mlir/lib/Interfaces/LoopLikeInterface.cpp b/mlir/lib/Interfaces/LoopLikeInterface.cpp index 1e0e87b64e811..d4cef29008c2a 100644 --- a/mlir/lib/Interfaces/LoopLikeInterface.cpp +++ b/mlir/lib/Interfaces/LoopLikeInterface.cpp @@ -9,7 +9,6 @@ #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/FunctionInterfaces.h" -#include "llvm/ADT/DenseSet.h" using namespace mlir; diff --git a/mlir/lib/Interfaces/SideEffectInterfaces.cpp b/mlir/lib/Interfaces/SideEffectInterfaces.cpp index 59fd19310cea5..266f6dbacce89 100644 --- a/mlir/lib/Interfaces/SideEffectInterfaces.cpp +++ b/mlir/lib/Interfaces/SideEffectInterfaces.cpp @@ -9,7 +9,6 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/IR/SymbolTable.h" -#include "llvm/ADT/SmallPtrSet.h" #include using namespace mlir; diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp index dafec39fd5eb0..0db9808b722a7 100644 --- a/mlir/lib/Pass/Pass.cpp +++ b/mlir/lib/Pass/Pass.cpp @@ -13,7 +13,6 @@ #include "mlir/Pass/Pass.h" #include "PassDetail.h" #include "mlir/IR/Diagnostics.h" -#include "mlir/IR/Dialect.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/Threading.h" #include "mlir/IR/Verifier.h" @@ -22,12 +21,9 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/CrashRecoveryContext.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/Signals.h" #include "llvm/Support/Threading.h" -#include "llvm/Support/ToolOutputFile.h" #include using namespace mlir; diff --git a/mlir/lib/Pass/PassCrashRecovery.cpp b/mlir/lib/Pass/PassCrashRecovery.cpp index b048ff9462392..3c9735f910094 100644 --- a/mlir/lib/Pass/PassCrashRecovery.cpp +++ b/mlir/lib/Pass/PassCrashRecovery.cpp @@ -8,16 +8,13 @@ #include "PassDetail.h" #include "mlir/IR/Diagnostics.h" -#include "mlir/IR/Dialect.h" #include "mlir/IR/SymbolTable.h" #include "mlir/IR/Verifier.h" #include "mlir/Parser/Parser.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/FileUtilities.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/CrashRecoveryContext.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" @@ -414,14 +411,19 @@ struct FileReproducerStream : public mlir::ReproducerStream { LogicalResult PassManager::runWithCrashRecovery(Operation *op, AnalysisManager am) { + const bool threadingEnabled = getContext()->isMultithreadingEnabled(); crashReproGenerator->initialize(getPasses(), op, verifyPasses); // Safely invoke the passes within a recovery context. 
LogicalResult passManagerResult = failure(); llvm::CrashRecoveryContext recoveryContext; - recoveryContext.RunSafelyOnThread( - [&] { passManagerResult = runPasses(op, am); }); + const auto runPassesFn = [&] { passManagerResult = runPasses(op, am); }; + if (threadingEnabled) + recoveryContext.RunSafelyOnThread(runPassesFn); + else + recoveryContext.RunSafely(runPassesFn); crashReproGenerator->finalize(op, passManagerResult); + return passManagerResult; } diff --git a/mlir/lib/Pass/PassTiming.cpp b/mlir/lib/Pass/PassTiming.cpp index 45852455aa737..dab56f09a72eb 100644 --- a/mlir/lib/Pass/PassTiming.cpp +++ b/mlir/lib/Pass/PassTiming.cpp @@ -11,7 +11,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Threading.h" -#include #include using namespace mlir; diff --git a/mlir/lib/Query/Matcher/ErrorBuilder.cpp b/mlir/lib/Query/Matcher/ErrorBuilder.cpp index de6447dac490a..b55ae05dbee16 100644 --- a/mlir/lib/Query/Matcher/ErrorBuilder.cpp +++ b/mlir/lib/Query/Matcher/ErrorBuilder.cpp @@ -8,7 +8,6 @@ #include "mlir/Query/Matcher/ErrorBuilder.h" #include "Diagnostics.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include diff --git a/mlir/lib/Query/Query.cpp b/mlir/lib/Query/Query.cpp index 637e1f3cdef87..b5a9d2f1d350c 100644 --- a/mlir/lib/Query/Query.cpp +++ b/mlir/lib/Query/Query.cpp @@ -13,7 +13,6 @@ #include "mlir/IR/Verifier.h" #include "mlir/Query/Matcher/MatchFinder.h" #include "mlir/Query/QuerySession.h" -#include "llvm/ADT/SetVector.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" diff --git a/mlir/lib/Reducer/ReductionNode.cpp b/mlir/lib/Reducer/ReductionNode.cpp index d57ee932bfcae..11aeaf77b4642 100644 --- a/mlir/lib/Reducer/ReductionNode.cpp +++ b/mlir/lib/Reducer/ReductionNode.cpp @@ -18,7 +18,6 @@ #include "mlir/IR/IRMapping.h" #include "llvm/ADT/STLExtras.h" -#include #include using namespace mlir; diff --git a/mlir/lib/Reducer/ReductionTreePass.cpp b/mlir/lib/Reducer/ReductionTreePass.cpp index 549e4f2bd813b..5b49204013cc0 100644 --- a/mlir/lib/Reducer/ReductionTreePass.cpp +++ b/mlir/lib/Reducer/ReductionTreePass.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #include "mlir/IR/DialectInterface.h" -#include "mlir/IR/OpDefinition.h" #include "mlir/Reducer/Passes.h" #include "mlir/Reducer/ReductionNode.h" #include "mlir/Reducer/ReductionPatternInterface.h" @@ -24,9 +23,7 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/ManagedStatic.h" namespace mlir { #define GEN_PASS_DEF_REDUCTIONTREEPASS diff --git a/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp b/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp index e706d95855435..34f1442ece7e6 100644 --- a/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp +++ b/mlir/lib/Rewrite/FrozenRewritePatternSet.cpp @@ -9,7 +9,6 @@ #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "ByteCode.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include diff --git a/mlir/lib/Rewrite/PatternApplicator.cpp b/mlir/lib/Rewrite/PatternApplicator.cpp index 4a12183492fd4..b2b372b7b1249 100644 --- a/mlir/lib/Rewrite/PatternApplicator.cpp +++ b/mlir/lib/Rewrite/PatternApplicator.cpp @@ -15,6 +15,10 @@ #include "ByteCode.h" #include "llvm/Support/Debug.h" +#ifndef NDEBUG +#include "llvm/ADT/ScopeExit.h" +#endif + #define DEBUG_TYPE "pattern-application" using 
namespace mlir; @@ -206,11 +210,19 @@ LogicalResult PatternApplicator::matchAndRewrite( } else { LLVM_DEBUG(llvm::dbgs() << "Trying to match \"" << bestPattern->getDebugName() << "\"\n"); const auto *pattern = static_cast(bestPattern); - result = pattern->matchAndRewrite(op, rewriter); +#ifndef NDEBUG + OpBuilder::Listener *oldListener = rewriter.getListener(); + auto loggingListener = + std::make_unique( + oldListener, pattern->getDebugName()); + rewriter.setListener(loggingListener.get()); + auto resetListenerCallback = llvm::make_scope_exit( + [&] { rewriter.setListener(oldListener); }); +#endif + result = pattern->matchAndRewrite(op, rewriter); LLVM_DEBUG(llvm::dbgs() << "\"" << bestPattern->getDebugName() << "\" result " << succeeded(result) << "\n"); diff --git a/mlir/lib/Support/Timing.cpp b/mlir/lib/Support/Timing.cpp index ac16eb7d224c9..fb6f82c283df5 100644 --- a/mlir/lib/Support/Timing.cpp +++ b/mlir/lib/Support/Timing.cpp @@ -19,13 +19,11 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Format.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/RWMutex.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" -#include #include #include diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 70029d7e15a90..ff34a0825215c 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -422,9 +422,18 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, ArrayRef operandsRef(operands); llvm::CallInst *call; if (auto attr = callOp.getCalleeAttr()) { - call = - builder.CreateCall(moduleTranslation.lookupFunction(attr.getValue()), - operandsRef, opBundles); + if (llvm::Function *function = + moduleTranslation.lookupFunction(attr.getValue())) { + call = builder.CreateCall(function, operandsRef, opBundles); + } else { + Operation *moduleOp = parentLLVMModule(&opInst); + Operation *ifuncOp = + moduleTranslation.symbolTable().lookupSymbolIn(moduleOp, attr); + llvm::GlobalValue *ifunc = moduleTranslation.lookupIFunc(ifuncOp); + llvm::FunctionType *calleeType = llvm::cast( + moduleTranslation.convertType(callOp.getCalleeFunctionType())); + call = builder.CreateCall(calleeType, ifunc, operandsRef, opBundles); + } } else { llvm::FunctionType *calleeType = llvm::cast( moduleTranslation.convertType(callOp.getCalleeFunctionType())); @@ -648,18 +657,21 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::LLVMFuncOp function = addressOfOp.getFunction(moduleTranslation.symbolTable()); LLVM::AliasOp alias = addressOfOp.getAlias(moduleTranslation.symbolTable()); + LLVM::IFuncOp ifunc = addressOfOp.getIFunc(moduleTranslation.symbolTable()); // The verifier should not have allowed this.
- assert((global || function || alias) && - "referencing an undefined global, function, or alias"); + assert((global || function || alias || ifunc) && + "referencing an undefined global, function, alias, or ifunc"); llvm::Value *llvmValue = nullptr; if (global) llvmValue = moduleTranslation.lookupGlobal(global); else if (alias) llvmValue = moduleTranslation.lookupAlias(alias); - else + else if (function) llvmValue = moduleTranslation.lookupFunction(function.getName()); + else + llvmValue = moduleTranslation.lookupIFunc(ifunc); moduleTranslation.mapValue(addressOfOp.getResult(), llvmValue); return success(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index eecca64c4bf81..b3577c6702389 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -118,6 +118,7 @@ static llvm::Intrinsic::ID getMatchSyncIntrinsicId(Type valType, return valType.isInteger(32) ? llvm::Intrinsic::nvvm_match_all_sync_i32p : llvm::Intrinsic::nvvm_match_all_sync_i64p; } + llvm_unreachable("unsupported match sync kind"); } static llvm::Intrinsic::ID getVoteSyncIntrinsicId(NVVM::VoteSyncKind kind) { diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 68a8d1758e434..3185f28fe6681 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -702,30 +702,6 @@ static void forwardArgs(LLVM::ModuleTranslation &moduleTranslation, moduleTranslation.mapValue(arg, moduleTranslation.lookupValue(var)); } -/// Helper function to map block arguments defined by ignored loop wrappers to -/// LLVM values and prevent any uses of those from triggering null pointer -/// dereferences. -/// -/// This must be called after block arguments of parent wrappers have already -/// been mapped to LLVM IR values. -static LogicalResult -convertIgnoredWrapper(omp::LoopWrapperInterface opInst, - LLVM::ModuleTranslation &moduleTranslation) { - // Map block arguments directly to the LLVM value associated to the - // corresponding operand. This is semantically equivalent to this wrapper not - // being present. - return llvm::TypeSwitch(opInst) - .Case([&](omp::SimdOp op) { - forwardArgs(moduleTranslation, - cast(*op)); - op.emitWarning() << "simd information on composite construct discarded"; - return success(); - }) - .Default([&](Operation *op) { - return op->emitError() << "cannot ignore wrapper"; - }); -} - /// Converts an OpenMP 'masked' operation into LLVM IR using OpenMPIRBuilder. 
static LogicalResult convertOmpMasked(Operation &opInst, llvm::IRBuilderBase &builder, @@ -2852,17 +2828,6 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); auto simdOp = cast(opInst); - // Ignore simd in composite constructs with unsupported clauses - // TODO: Replace this once simd + clause combinations are properly supported - if (simdOp.isComposite() && - (simdOp.getReductionByref().has_value() || simdOp.getIfExpr())) { - if (failed(convertIgnoredWrapper(simdOp, moduleTranslation))) - return failure(); - - return inlineConvertOmpRegions(simdOp.getRegion(), "omp.simd.region", - builder, moduleTranslation); - } - if (failed(checkImplementationStatus(opInst))) return failure(); diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index bfda223fe0f5f..c807985756539 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1031,6 +1031,16 @@ LogicalResult ModuleImport::convertAliases() { return success(); } +LogicalResult ModuleImport::convertIFuncs() { + for (llvm::GlobalIFunc &ifunc : llvmModule->ifuncs()) { + if (failed(convertIFunc(&ifunc))) { + return emitError(UnknownLoc::get(context)) + << "unhandled global ifunc: " << diag(ifunc); + } + } + return success(); +} + LogicalResult ModuleImport::convertDataLayout() { Location loc = mlirModule.getLoc(); DataLayoutImporter dataLayoutImporter(context, llvmModule->getDataLayout()); @@ -1369,6 +1379,21 @@ LogicalResult ModuleImport::convertAlias(llvm::GlobalAlias *alias) { return success(); } +LogicalResult ModuleImport::convertIFunc(llvm::GlobalIFunc *ifunc) { + OpBuilder::InsertionGuard guard = setGlobalInsertionPoint(); + + Type type = convertType(ifunc->getValueType()); + llvm::Constant *resolver = ifunc->getResolver(); + Type resolverType = convertType(resolver->getType()); + builder.create(mlirModule.getLoc(), ifunc->getName(), type, + resolver->getName(), resolverType, + convertLinkageFromLLVM(ifunc->getLinkage()), + ifunc->isDSOLocal(), ifunc->getAddressSpace(), + convertUnnamedAddrFromLLVM(ifunc->getUnnamedAddr()), + convertVisibilityFromLLVM(ifunc->getVisibility())); + return success(); +} + LogicalResult ModuleImport::convertGlobal(llvm::GlobalVariable *globalVar) { // Insert the global after the last one or at the start of the module. OpBuilder::InsertionGuard guard = setGlobalInsertionPoint(); @@ -1973,8 +1998,9 @@ ModuleImport::convertCallOperands(llvm::CallBase *callInst, // treated as indirect calls to constant operands that need to be converted. // Skip the callee operand if it's inline assembly, as it's handled separately // in InlineAsmOp. - if (!isa(callInst->getCalledOperand()) && !isInlineAsm) { - FailureOr called = convertValue(callInst->getCalledOperand()); + llvm::Value *calleeOperand = callInst->getCalledOperand(); + if (!isa(calleeOperand) && !isInlineAsm) { + FailureOr called = convertValue(calleeOperand); if (failed(called)) return failure(); operands.push_back(*called); @@ -2035,12 +2061,20 @@ ModuleImport::convertFunctionType(llvm::CallBase *callInst, if (failed(callType)) return failure(); auto *callee = dyn_cast(calledOperand); + + llvm::FunctionType *origCalleeType = nullptr; + if (callee) { + origCalleeType = callee->getFunctionType(); + } else if (auto *ifunc = dyn_cast(calledOperand)) { + origCalleeType = cast(ifunc->getValueType()); + } + // For indirect calls, return the type of the call itself. 
- if (!callee) + if (!origCalleeType) return callType; FailureOr calleeType = - castOrFailure(convertType(callee->getFunctionType())); + castOrFailure(convertType(origCalleeType)); if (failed(calleeType)) return failure(); @@ -2059,8 +2093,8 @@ ModuleImport::convertFunctionType(llvm::CallBase *callInst, FlatSymbolRefAttr ModuleImport::convertCalleeName(llvm::CallBase *callInst) { llvm::Value *calledOperand = callInst->getCalledOperand(); - if (auto *callee = dyn_cast(calledOperand)) - return SymbolRefAttr::get(context, callee->getName()); + if (isa(calledOperand)) + return SymbolRefAttr::get(context, calledOperand->getName()); return {}; } @@ -3162,6 +3196,8 @@ OwningOpRef mlir::translateLLVMIRToModule( return {}; if (failed(moduleImport.convertAliases())) return {}; + if (failed(moduleImport.convertIFuncs())) + return {}; moduleImport.convertTargetTriple(); return module; } diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 8908703cc1368..b997e559885e2 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -791,6 +791,8 @@ void ModuleTranslation::forgetMapping(Region ®ion) { globalsMapping.erase(&op); if (isa(op)) aliasesMapping.erase(&op); + if (isa(op)) + ifuncMapping.erase(&op); if (isa(op)) callMapping.erase(&op); llvm::append_range( @@ -1868,6 +1870,33 @@ LogicalResult ModuleTranslation::convertFunctions() { return success(); } +LogicalResult ModuleTranslation::convertIFuncs() { + for (auto op : getModuleBody(mlirModule).getOps()) { + llvm::Type *type = convertType(op.getIFuncType()); + llvm::GlobalValue::LinkageTypes linkage = + convertLinkageToLLVM(op.getLinkage()); + llvm::Constant *resolver; + if (auto *resolverFn = lookupFunction(op.getResolver())) { + resolver = cast(resolverFn); + } else { + Operation *aliasOp = symbolTable().lookupSymbolIn(parentLLVMModule(op), + op.getResolverAttr()); + resolver = cast(lookupAlias(aliasOp)); + } + + auto *ifunc = + llvm::GlobalIFunc::create(type, op.getAddressSpace(), linkage, + op.getSymName(), resolver, llvmModule.get()); + addRuntimePreemptionSpecifier(op.getDsoLocal(), ifunc); + ifunc->setUnnamedAddr(convertUnnamedAddrToLLVM(op.getUnnamedAddr())); + ifunc->setVisibility(convertVisibilityToLLVM(op.getVisibility_())); + + ifuncMapping.try_emplace(op, ifunc); + } + + return success(); +} + LogicalResult ModuleTranslation::convertComdats() { for (auto comdatOp : getModuleBody(mlirModule).getOps()) { for (auto selectorOp : comdatOp.getOps()) { @@ -2284,6 +2313,8 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, return nullptr; if (failed(translator.convertGlobalsAndAliases())) return nullptr; + if (failed(translator.convertIFuncs())) + return nullptr; if (failed(translator.createTBAAMetadata())) return nullptr; if (failed(translator.createIdentMetadata())) @@ -2296,7 +2327,8 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, // Convert other top-level operations if possible. 
for (Operation &o : getModuleBody(module).getOperations()) { if (!isa(&o) && + LLVM::GlobalCtorsOp, LLVM::GlobalDtorsOp, LLVM::ComdatOp, + LLVM::IFuncOp>(&o) && !o.hasTrait() && failed(translator.convertOperation(o, llvmBuilder))) { return nullptr; diff --git a/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp b/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp index 55d6a380d0bff..9fa03725d05ee 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp @@ -45,6 +45,12 @@ Value spirv::Deserializer::getValue(uint32_t id) { return opBuilder.create(unknownLoc, constInfo->second, constInfo->first); } + if (std::optional> constCompositeReplicateInfo = + getConstantCompositeReplicate(id)) { + return opBuilder.create( + unknownLoc, constCompositeReplicateInfo->second, + constCompositeReplicateInfo->first); + } if (auto varOp = getGlobalVariable(id)) { auto addressOfOp = opBuilder.create( unknownLoc, varOp.getType(), SymbolRefAttr::get(varOp.getOperation())); @@ -56,10 +62,18 @@ Value spirv::Deserializer::getValue(uint32_t id) { SymbolRefAttr::get(constOp.getOperation())); return referenceOfOp.getReference(); } - if (auto constCompositeOp = getSpecConstantComposite(id)) { + if (SpecConstantCompositeOp specConstCompositeOp = + getSpecConstantComposite(id)) { + auto referenceOfOp = opBuilder.create( + unknownLoc, specConstCompositeOp.getType(), + SymbolRefAttr::get(specConstCompositeOp.getOperation())); + return referenceOfOp.getReference(); + } + if (auto specConstCompositeReplicateOp = + getSpecConstantCompositeReplicate(id)) { auto referenceOfOp = opBuilder.create( - unknownLoc, constCompositeOp.getType(), - SymbolRefAttr::get(constCompositeOp.getOperation())); + unknownLoc, specConstCompositeReplicateOp.getType(), + SymbolRefAttr::get(specConstCompositeReplicateOp.getOperation())); return referenceOfOp.getReference(); } if (auto specConstOperationInfo = getSpecConstantOperation(id)) { @@ -175,8 +189,12 @@ LogicalResult spirv::Deserializer::processInstruction( return processConstant(operands, /*isSpec=*/true); case spirv::Opcode::OpConstantComposite: return processConstantComposite(operands); + case spirv::Opcode::OpConstantCompositeReplicateEXT: + return processConstantCompositeReplicateEXT(operands); case spirv::Opcode::OpSpecConstantComposite: return processSpecConstantComposite(operands); + case spirv::Opcode::OpSpecConstantCompositeReplicateEXT: + return processSpecConstantCompositeReplicateEXT(operands); case spirv::Opcode::OpSpecConstantOp: return processSpecConstantOperation(operands); case spirv::Opcode::OpConstantTrue: diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index b1abd8b3dffe9..d133d0332e271 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -678,6 +678,14 @@ spirv::Deserializer::getConstant(uint32_t id) { return constIt->getSecond(); } +std::optional> +spirv::Deserializer::getConstantCompositeReplicate(uint32_t id) { + if (auto it = constantCompositeReplicateMap.find(id); + it != constantCompositeReplicateMap.end()) + return it->second; + return std::nullopt; +} + std::optional spirv::Deserializer::getSpecConstantOperation(uint32_t id) { auto constIt = specConstOperationMap.find(id); @@ -1554,15 +1562,63 @@ spirv::Deserializer::processConstantComposite(ArrayRef operands) { return success(); } +LogicalResult 
spirv::Deserializer::processConstantCompositeReplicateEXT( + ArrayRef operands) { + if (operands.size() != 3) { + return emitError( + unknownLoc, + "OpConstantCompositeReplicateEXT expects 3 operands but found ") + << operands.size(); + } + + Type resultType = getType(operands[0]); + if (!resultType) { + return emitError(unknownLoc, "undefined result type from <id> ") + << operands[0]; + } + + auto compositeType = dyn_cast(resultType); + if (!compositeType) { + return emitError(unknownLoc, + "result type from <id> is not a composite type") + << operands[0]; + } + + uint32_t resultID = operands[1]; + uint32_t constantID = operands[2]; + + std::optional> constantInfo = + getConstant(constantID); + if (constantInfo.has_value()) { + constantCompositeReplicateMap.try_emplace( + resultID, constantInfo.value().first, resultType); + return success(); + } + + std::optional> replicatedConstantCompositeInfo = + getConstantCompositeReplicate(constantID); + if (replicatedConstantCompositeInfo.has_value()) { + constantCompositeReplicateMap.try_emplace( + resultID, replicatedConstantCompositeInfo.value().first, resultType); + return success(); + } + + return emitError(unknownLoc, "OpConstantCompositeReplicateEXT operand ") + << constantID + << " must come from a normal constant or an " + "OpConstantCompositeReplicateEXT"; +} + LogicalResult spirv::Deserializer::processSpecConstantComposite(ArrayRef operands) { if (operands.size() < 2) { - return emitError(unknownLoc, - "OpConstantComposite must have type <id> and result <id>"); + return emitError( + unknownLoc, + "OpSpecConstantComposite must have type <id> and result <id>"); } if (operands.size() < 3) { return emitError(unknownLoc, - "OpConstantComposite must have at least 1 parameter"); + "OpSpecConstantComposite must have at least 1 parameter"); } Type resultType = getType(operands[0]); @@ -1589,6 +1645,41 @@ spirv::Deserializer::processSpecConstantComposite(ArrayRef operands) { return success(); } +LogicalResult spirv::Deserializer::processSpecConstantCompositeReplicateEXT( + ArrayRef operands) { + if (operands.size() != 3) { + return emitError(unknownLoc, "OpSpecConstantCompositeReplicateEXT expects " + "3 operands but found ") + << operands.size(); + } + + Type resultType = getType(operands[0]); + if (!resultType) { + return emitError(unknownLoc, "undefined result type from <id> ") + << operands[0]; + } + + auto compositeType = dyn_cast(resultType); + if (!compositeType) { + return emitError(unknownLoc, + "result type from <id> is not a composite type") + << operands[0]; + } + + uint32_t resultID = operands[1]; + + auto symName = opBuilder.getStringAttr(getSpecConstantSymbol(resultID)); + spirv::SpecConstantOp constituentSpecConstantOp = + getSpecConstant(operands[2]); + auto op = opBuilder.create( + unknownLoc, TypeAttr::get(resultType), symName, + SymbolRefAttr::get(constituentSpecConstantOp)); + + specConstCompositeReplicateMap[resultID] = op; + + return success(); +} + LogicalResult spirv::Deserializer::processSpecConstantOperation(ArrayRef operands) { if (operands.size() < 3) diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h index 1bc9e4a3c75d8..20482bd2bf501 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h @@ -190,6 +190,11 @@ class Deserializer { /// Gets the constant's attribute and type associated with the given <id>.
std::optional> getConstant(uint32_t id); + /// Gets the replicated composite constant's attribute and type associated + /// with the given <id>. + std::optional> + getConstantCompositeReplicate(uint32_t id); + /// Gets the info needed to materialize the spec constant operation op /// associated with the given <id>. std::optional @@ -220,6 +225,13 @@ return specConstCompositeMap.lookup(id); } + /// Gets the replicated composite specialization constant with the given + /// result <id>. + spirv::EXTSpecConstantCompositeReplicateOp + getSpecConstantCompositeReplicate(uint32_t id) { + return specConstCompositeReplicateMap.lookup(id); + } + /// Creates a spirv::SpecConstantOp. spirv::SpecConstantOp createSpecConstant(Location loc, uint32_t resultID, TypedAttr defaultValue); @@ -313,10 +325,20 @@ /// `operands`. LogicalResult processConstantComposite(ArrayRef operands); + /// Processes a SPIR-V OpConstantCompositeReplicateEXT instruction with + /// the given `operands`. + LogicalResult + processConstantCompositeReplicateEXT(ArrayRef operands); + /// Processes a SPIR-V OpSpecConstantComposite instruction with the given /// `operands`. LogicalResult processSpecConstantComposite(ArrayRef operands); + /// Processes a SPIR-V OpSpecConstantCompositeReplicateEXT instruction with + /// the given `operands`. + LogicalResult + processSpecConstantCompositeReplicateEXT(ArrayRef operands); + /// Processes a SPIR-V OpSpecConstantOp instruction with the given /// `operands`. LogicalResult processSpecConstantOperation(ArrayRef operands); @@ -549,12 +571,28 @@ /// (and type) here. Later when it's used, we materialize the constant. DenseMap> constantMap; + /// Result <id> to replicated constant attribute and type mapping. + /// + /// In the SPIR-V binary format, OpConstantCompositeReplicateEXT is placed in + /// the module and shared by instructions at module level and in subsequent + /// functions. But in the SPIR-V dialect, this is materialized to where + /// it's used in the function. So when seeing an + /// OpConstantCompositeReplicateEXT in the binary format, we don't immediately + /// emit a `spirv.EXT.ConstantCompositeReplicate` op into the module; instead, + /// we keep the <id> of its value and type here. Later, when it's used, we + /// materialize the `spirv.EXT.ConstantCompositeReplicate`. + DenseMap> constantCompositeReplicateMap; + // Result <id> to spec constant mapping. DenseMap specConstMap; // Result <id> to composite spec constant mapping. DenseMap specConstCompositeMap; + // Result <id> to replicated composite spec constant mapping. + DenseMap + specConstCompositeReplicateMap; + /// Result <id> to info needed to materialize an OpSpecConstantOp /// mapping.
// Result <id> to spec constant mapping. DenseMap<uint32_t, spirv::SpecConstantOp> specConstMap; // Result <id> to composite spec constant mapping. DenseMap<uint32_t, spirv::SpecConstantCompositeOp> specConstCompositeMap; + // Result <id> to replicated composite spec constant mapping. + DenseMap<uint32_t, spirv::EXTSpecConstantCompositeReplicateOp> + specConstCompositeReplicateMap; + /// Result <id> to info needed to materialize an OpSpecConstantOp /// mapping. DenseMap<uint32_t, SpecConstOperationMaterializationInfo> diff --git a/mlir/lib/Target/SPIRV/Serialization/SerializeOps.cpp b/mlir/lib/Target/SPIRV/Serialization/SerializeOps.cpp index ff3cc92ee8078..d62529b85b3aa 100644 --- a/mlir/lib/Target/SPIRV/Serialization/SerializeOps.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/SerializeOps.cpp @@ -66,6 +66,16 @@ LogicalResult Serializer::processConstantOp(spirv::ConstantOp op) { return failure(); } +LogicalResult Serializer::processConstantCompositeReplicateOp( + spirv::EXTConstantCompositeReplicateOp op) { + if (uint32_t resultID = prepareConstantCompositeReplicate( + op.getLoc(), op.getType(), op.getValue())) { + valueIDMap[op.getResult()] = resultID; + return success(); + } + return failure(); +} + LogicalResult Serializer::processSpecConstantOp(spirv::SpecConstantOp op) { if (auto resultID = prepareConstantScalar(op.getLoc(), op.getDefaultValue(), /*isSpec=*/true)) { @@ -118,6 +128,38 @@ Serializer::processSpecConstantCompositeOp(spirv::SpecConstantCompositeOp op) { return processName(resultID, op.getSymName()); } +LogicalResult Serializer::processSpecConstantCompositeReplicateOp( + spirv::EXTSpecConstantCompositeReplicateOp op) { + uint32_t typeID = 0; + if (failed(processType(op.getLoc(), op.getType(), typeID))) { + return failure(); + } + + auto constituent = dyn_cast<FlatSymbolRefAttr>(op.getConstituent()); + if (!constituent) + return op.emitError( + "expected flat symbol reference for constituent instead of ") + << op.getConstituent(); + + StringRef constituentName = constituent.getValue(); + uint32_t constituentID = getSpecConstID(constituentName); + if (!constituentID) { + return op.emitError("unknown result <id> for replicated spec constant ") + << constituentName; + } + + uint32_t resultID = getNextID(); + uint32_t operands[] = {typeID, resultID, constituentID}; + + encodeInstructionInto(typesGlobalValues, + spirv::Opcode::OpSpecConstantCompositeReplicateEXT, + operands); + + specConstIDMap[op.getSymName()] = resultID; + + return processName(resultID, op.getSymName()); +} + LogicalResult Serializer::processSpecConstantOperationOp(spirv::SpecConstantOperationOp op) { uint32_t typeID = 0; diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp index ebebd2d283afa..3400fcf6374e8 100644 --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -1109,6 +1109,55 @@ uint32_t Serializer::prepareConstantFp(Location loc, FloatAttr floatAttr, return resultID; } +uint32_t Serializer::prepareConstantCompositeReplicate(Location loc, + Type resultType, + Attribute valueAttr) { + std::pair<Attribute, Type> valueTypePair{valueAttr, resultType}; + if (uint32_t id = getConstantCompositeReplicateID(valueTypePair)) { + return id; + } + + uint32_t typeID = 0; + if (failed(processType(loc, resultType, typeID))) { + return 0; + } + + Type valueType; + if (auto typedAttr = dyn_cast<TypedAttr>(valueAttr)) { + valueType = typedAttr.getType(); + } else if (auto arrayAttr = dyn_cast<ArrayAttr>(valueAttr)) { + auto typedElemAttr = dyn_cast<TypedAttr>(arrayAttr[0]); + if (!typedElemAttr) + return 0; + valueType = + spirv::ArrayType::get(typedElemAttr.getType(), arrayAttr.size()); + } else { + return 0; + } + + auto compositeType = dyn_cast<spirv::CompositeType>(resultType); + if (!compositeType) + return 0; + Type elementType = compositeType.getElementType(0); + + uint32_t constituentID; + if (elementType == valueType) { + constituentID = prepareConstant(loc, elementType, valueAttr); + } else { + constituentID = prepareConstantCompositeReplicate(loc, elementType, valueAttr); + } +
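Note on the recursion above: when the splat value matches only the innermost element type of a nested composite, each nesting level emits its own instruction, with the inner result feeding the outer one. A hedged sketch (the <id> numbering and type names are illustrative):

// %0 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.array<2 x vector<3xi32>>
// serializes roughly as:
//   %1 = OpConstant %i32 1
//   %2 = OpConstantCompositeReplicateEXT %v3i32 %1   ; inner vector level
//   %3 = OpConstantCompositeReplicateEXT %arr2v3 %2  ; outer array level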
+ uint32_t resultID = getNextID(); + uint32_t operands[] = {typeID, resultID, constituentID}; + + encodeInstructionInto(typesGlobalValues, + spirv::Opcode::OpConstantCompositeReplicateEXT, + operands); + + constCompositeReplicateIDMap[valueTypePair] = resultID; + return resultID; +} + //===----------------------------------------------------------------------===// // Control flow //===----------------------------------------------------------------------===// @@ -1328,6 +1377,9 @@ LogicalResult Serializer::processOperation(Operation *opInst) { return processBranchConditionalOp(op); }) .Case([&](spirv::ConstantOp op) { return processConstantOp(op); }) + .Case([&](spirv::EXTConstantCompositeReplicateOp op) { + return processConstantCompositeReplicateOp(op); + }) .Case([&](spirv::FuncOp op) { return processFuncOp(op); }) .Case([&](spirv::GlobalVariableOp op) { return processGlobalVariableOp(op); @@ -1339,6 +1391,9 @@ LogicalResult Serializer::processOperation(Operation *opInst) { .Case([&](spirv::SpecConstantCompositeOp op) { return processSpecConstantCompositeOp(op); }) + .Case([&](spirv::EXTSpecConstantCompositeReplicateOp op) { + return processSpecConstantCompositeReplicateOp(op); + }) .Case([&](spirv::SpecConstantOperationOp op) { return processSpecConstantOperationOp(op); }) diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.h b/mlir/lib/Target/SPIRV/Serialization/Serializer.h index 9edb0f4af008d..7047869bca4cd 100644 --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.h +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.h @@ -108,11 +108,17 @@ class Serializer { LogicalResult processConstantOp(spirv::ConstantOp op); + LogicalResult processConstantCompositeReplicateOp( + spirv::EXTConstantCompositeReplicateOp op); + LogicalResult processSpecConstantOp(spirv::SpecConstantOp op); LogicalResult processSpecConstantCompositeOp(spirv::SpecConstantCompositeOp op); + LogicalResult processSpecConstantCompositeReplicateOp( + spirv::EXTSpecConstantCompositeReplicateOp op); + LogicalResult processSpecConstantOperationOp(spirv::SpecConstantOperationOp op); @@ -191,6 +197,11 @@ class Serializer { return constIDMap.lookup(value); } + uint32_t getConstantCompositeReplicateID( + std::pair<Attribute, Type> valueTypePair) const { + return constCompositeReplicateIDMap.lookup(valueTypePair); + } + /// Main dispatch method for processing a constant with the given `constType` /// and `valueAttr`. `constType` is needed here because we can interpret the /// `valueAttr` as a different type than the type of `valueAttr` itself; for @@ -230,6 +241,12 @@ class Serializer { uint32_t prepareConstantFp(Location loc, FloatAttr floatAttr, bool isSpec = false); + /// Prepares `spirv.EXT.ConstantCompositeReplicate` op serialization. This + /// method emits OpConstantCompositeReplicateEXT and returns the result <id> + /// associated with it. + uint32_t prepareConstantCompositeReplicate(Location loc, Type resultType, + Attribute valueAttr); + //===--------------------------------------------------------------------===// // Control flow //===--------------------------------------------------------------------===// @@ -389,6 +406,9 @@ class Serializer { /// Map from constant values to their <id>s. DenseMap<Attribute, uint32_t> constIDMap; + /// Map from a replicated composite constant's value and type to its <id>. + DenseMap<std::pair<Attribute, Type>, uint32_t> constCompositeReplicateIDMap; + /// Map from specialization constant names to their <id>s.
llvm::StringMap specConstIDMap; diff --git a/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp b/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp index 611b734dc4b05..9670285e767b8 100644 --- a/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp +++ b/mlir/lib/Tools/PDLL/CodeGen/CPPGen.cpp @@ -13,12 +13,10 @@ //===----------------------------------------------------------------------===// #include "mlir/Tools/PDLL/CodeGen/CPPGen.h" -#include "mlir/Dialect/PDL/IR/PDL.h" #include "mlir/Dialect/PDL/IR/PDLOps.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Tools/PDLL/AST/Nodes.h" #include "mlir/Tools/PDLL/ODS/Operation.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/TypeSwitch.h" diff --git a/mlir/lib/Tools/PDLL/ODS/Context.cpp b/mlir/lib/Tools/PDLL/ODS/Context.cpp index 61a9df92c8047..a3933c9c73b20 100644 --- a/mlir/lib/Tools/PDLL/ODS/Context.cpp +++ b/mlir/lib/Tools/PDLL/ODS/Context.cpp @@ -12,7 +12,6 @@ #include "mlir/Tools/PDLL/ODS/Operation.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" -#include using namespace mlir; using namespace mlir::pdll::ods; diff --git a/mlir/lib/Tools/PDLL/ODS/Dialect.cpp b/mlir/lib/Tools/PDLL/ODS/Dialect.cpp index b4654a6ad5b2e..5e4ea5063b78d 100644 --- a/mlir/lib/Tools/PDLL/ODS/Dialect.cpp +++ b/mlir/lib/Tools/PDLL/ODS/Dialect.cpp @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Tools/PDLL/ODS/Dialect.h" -#include "mlir/Tools/PDLL/ODS/Constraint.h" #include "mlir/Tools/PDLL/ODS/Operation.h" -#include "llvm/Support/raw_ostream.h" using namespace mlir; using namespace mlir::pdll::ods; diff --git a/mlir/lib/Tools/PDLL/Parser/Parser.cpp b/mlir/lib/Tools/PDLL/Parser/Parser.cpp index 50e10447ee468..51e702a1bb53a 100644 --- a/mlir/lib/Tools/PDLL/Parser/Parser.cpp +++ b/mlir/lib/Tools/PDLL/Parser/Parser.cpp @@ -25,7 +25,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/TableGen/Error.h" diff --git a/mlir/lib/Tools/lsp-server-support/Logging.cpp b/mlir/lib/Tools/lsp-server-support/Logging.cpp index 6335fbfb5dfb8..373e2165c244d 100644 --- a/mlir/lib/Tools/lsp-server-support/Logging.cpp +++ b/mlir/lib/Tools/lsp-server-support/Logging.cpp @@ -8,7 +8,6 @@ #include "mlir/Tools/lsp-server-support/Logging.h" #include "llvm/Support/Chrono.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/raw_ostream.h" using namespace mlir; diff --git a/mlir/lib/Tools/lsp-server-support/Protocol.cpp b/mlir/lib/Tools/lsp-server-support/Protocol.cpp index 0054dc37b39cb..33cdd2855eff1 100644 --- a/mlir/lib/Tools/lsp-server-support/Protocol.cpp +++ b/mlir/lib/Tools/lsp-server-support/Protocol.cpp @@ -11,14 +11,9 @@ //===----------------------------------------------------------------------===// #include "mlir/Tools/lsp-server-support/Protocol.h" -#include "mlir/Tools/lsp-server-support/Logging.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" diff --git a/mlir/lib/Tools/lsp-server-support/Transport.cpp 
b/mlir/lib/Tools/lsp-server-support/Transport.cpp index d0863ba0ae087..5a098b2841f4b 100644 --- a/mlir/lib/Tools/lsp-server-support/Transport.cpp +++ b/mlir/lib/Tools/lsp-server-support/Transport.cpp @@ -11,7 +11,6 @@ #include "mlir/Tools/lsp-server-support/Logging.h" #include "mlir/Tools/lsp-server-support/Protocol.h" #include "llvm/ADT/SmallString.h" -#include "llvm/Support/Errno.h" #include "llvm/Support/Error.h" #include #include diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp index 31e0caa768113..8f785900780f5 100644 --- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp +++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp @@ -15,20 +15,15 @@ #include "mlir/Bytecode/BytecodeWriter.h" #include "mlir/Debug/CLOptionsSetup.h" #include "mlir/Debug/Counter.h" -#include "mlir/Debug/DebuggerExecutionContextHook.h" -#include "mlir/Debug/ExecutionContext.h" -#include "mlir/Debug/Observers/ActionLogging.h" #include "mlir/Dialect/IRDL/IR/IRDL.h" #include "mlir/Dialect/IRDL/IRDLLoading.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Diagnostics.h" -#include "mlir/IR/Dialect.h" #include "mlir/IR/Location.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Parser/Parser.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Pass/PassRegistry.h" #include "mlir/Support/FileUtilities.h" @@ -39,14 +34,12 @@ #include "mlir/Tools/Plugins/PassPlugin.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileUtilities.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/LogicalResult.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Process.h" #include "llvm/Support/Regex.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/StringSaver.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/ToolOutputFile.h" diff --git a/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp b/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp index c15f17025e3c1..82542a12a1807 100644 --- a/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp +++ b/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp @@ -12,8 +12,6 @@ #include "Protocol.h" #include "mlir/Tools/lsp-server-support/Logging.h" #include "mlir/Tools/lsp-server-support/Transport.h" -#include "llvm/ADT/FunctionExtras.h" -#include "llvm/ADT/StringMap.h" #include #define DEBUG_TYPE "pdll-lsp-server" diff --git a/mlir/lib/Tools/mlir-pdll-lsp-server/Protocol.cpp b/mlir/lib/Tools/mlir-pdll-lsp-server/Protocol.cpp index 27a1b712ae801..0c9896e3ec1b4 100644 --- a/mlir/lib/Tools/mlir-pdll-lsp-server/Protocol.cpp +++ b/mlir/lib/Tools/mlir-pdll-lsp-server/Protocol.cpp @@ -11,14 +11,8 @@ //===----------------------------------------------------------------------===// #include "Protocol.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallString.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/JSON.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/raw_ostream.h" using namespace mlir; using namespace mlir::lsp; diff --git a/mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp b/mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp index a9cde4acf4ff8..e89d392d375e8 100644 --- a/mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp +++ b/mlir/lib/Tools/mlir-reduce/MlirReduceMain.cpp @@ -14,12 +14,9 @@ //===----------------------------------------------------------------------===// #include 
"mlir/Tools/mlir-reduce/MlirReduceMain.h" -#include "mlir/IR/PatternMatch.h" #include "mlir/Parser/Parser.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Reducer/Passes.h" -#include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Tools/ParseUtilities.h" #include "llvm/Support/InitLLVM.h" diff --git a/mlir/lib/Tools/mlir-translate/Translation.cpp b/mlir/lib/Tools/mlir-translate/Translation.cpp index baacfb413a3fe..61a247034c2db 100644 --- a/mlir/lib/Tools/mlir-translate/Translation.cpp +++ b/mlir/lib/Tools/mlir-translate/Translation.cpp @@ -12,8 +12,6 @@ #include "mlir/Tools/mlir-translate/Translation.h" #include "mlir/IR/AsmState.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/Dialect.h" #include "mlir/IR/Verifier.h" #include "mlir/Parser/Parser.h" #include "mlir/Tools/ParseUtilities.h" diff --git a/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp b/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp index 993eea376bc26..bb3c0a77747aa 100644 --- a/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp +++ b/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp @@ -12,8 +12,6 @@ #include "mlir/Tools/lsp-server-support/Logging.h" #include "mlir/Tools/lsp-server-support/Protocol.h" #include "mlir/Tools/lsp-server-support/Transport.h" -#include "llvm/ADT/FunctionExtras.h" -#include "llvm/ADT/StringMap.h" #include using namespace mlir; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 4d01a83d37165..4c4ce3cb41fd5 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -789,26 +789,13 @@ enum MaterializationKind { Source }; -/// An unresolved materialization, i.e., a "builtin.unrealized_conversion_cast" -/// op. Unresolved materializations are erased at the end of the dialect -/// conversion. -class UnresolvedMaterializationRewrite : public OperationRewrite { +/// Helper class that stores metadata about an unresolved materialization. +class UnresolvedMaterializationInfo { public: - UnresolvedMaterializationRewrite(ConversionPatternRewriterImpl &rewriterImpl, - UnrealizedConversionCastOp op, - const TypeConverter *converter, - MaterializationKind kind, Type originalType, - ValueVector mappedValues); - - static bool classof(const IRRewrite *rewrite) { - return rewrite->getKind() == Kind::UnresolvedMaterialization; - } - - void rollback() override; - - UnrealizedConversionCastOp getOperation() const { - return cast(op); - } + UnresolvedMaterializationInfo() = default; + UnresolvedMaterializationInfo(const TypeConverter *converter, + MaterializationKind kind, Type originalType) + : converterAndKind(converter, kind), originalType(originalType) {} /// Return the type converter of this materialization (which may be null). const TypeConverter *getConverter() const { @@ -832,7 +819,30 @@ class UnresolvedMaterializationRewrite : public OperationRewrite { /// The original type of the SSA value. Only used for target /// materializations. Type originalType; +}; + +/// An unresolved materialization, i.e., a "builtin.unrealized_conversion_cast" +/// op. Unresolved materializations fold away or are replaced with +/// source/target materializations at the end of the dialect conversion. 
+/// An unresolved materialization, i.e., a "builtin.unrealized_conversion_cast" +/// op. Unresolved materializations fold away or are replaced with +/// source/target materializations at the end of the dialect conversion. +class UnresolvedMaterializationRewrite : public OperationRewrite { +public: + UnresolvedMaterializationRewrite(ConversionPatternRewriterImpl &rewriterImpl, + UnrealizedConversionCastOp op, + ValueVector mappedValues) + : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op), + mappedValues(std::move(mappedValues)) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::UnresolvedMaterialization; + } + + void rollback() override; + + UnrealizedConversionCastOp getOperation() const { + return cast<UnrealizedConversionCastOp>(op); + } + +private: /// The values in the conversion value mapping that are being replaced by the /// results of this unresolved materialization. ValueVector mappedValues; @@ -1088,9 +1098,8 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// by the current pattern. SetVector<Block *> patternInsertedBlocks; - /// A mapping of all unresolved materializations (UnrealizedConversionCastOp) - /// to the corresponding rewrite objects. - DenseMap<UnrealizedConversionCastOp, UnresolvedMaterializationRewrite *> + /// A mapping for looking up metadata of unresolved materializations. + DenseMap<UnrealizedConversionCastOp, UnresolvedMaterializationInfo> unresolvedMaterializations; /// The current type converter, or nullptr if no type converter is currently @@ -1210,18 +1219,6 @@ void CreateOperationRewrite::rollback() { op->erase(); } -UnresolvedMaterializationRewrite::UnresolvedMaterializationRewrite( - ConversionPatternRewriterImpl &rewriterImpl, UnrealizedConversionCastOp op, - const TypeConverter *converter, MaterializationKind kind, Type originalType, - ValueVector mappedValues) - : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op), - converterAndKind(converter, kind), originalType(originalType), - mappedValues(std::move(mappedValues)) { - assert((!originalType || kind == MaterializationKind::Target) && - "original type is valid only for target materializations"); - rewriterImpl.unresolvedMaterializations[op] = this; -} - void UnresolvedMaterializationRewrite::rollback() { if (!mappedValues.empty()) rewriterImpl.mapping.erase(mappedValues); @@ -1510,8 +1507,10 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( mapping.map(valuesToMap, convertOp.getResults()); if (castOp) *castOp = convertOp; - appendRewrite<UnresolvedMaterializationRewrite>( - convertOp, converter, kind, originalType, std::move(valuesToMap)); + unresolvedMaterializations[convertOp] = + UnresolvedMaterializationInfo(converter, kind, originalType); + appendRewrite<UnresolvedMaterializationRewrite>(convertOp, + std::move(valuesToMap)); return convertOp.getResults(); } @@ -2678,21 +2677,21 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, static LogicalResult legalizeUnresolvedMaterialization(RewriterBase &rewriter, - UnresolvedMaterializationRewrite *rewrite) { - UnrealizedConversionCastOp op = rewrite->getOperation(); + UnrealizedConversionCastOp op, + const UnresolvedMaterializationInfo &info) { assert(!op.use_empty() && "expected that dead materializations have already been DCE'd"); Operation::operand_range inputOperands = op.getOperands(); // Try to materialize the conversion.
- if (const TypeConverter *converter = rewrite->getConverter()) { + if (const TypeConverter *converter = info.getConverter()) { rewriter.setInsertionPoint(op); SmallVector<Value> newMaterialization; - switch (rewrite->getMaterializationKind()) { + switch (info.getMaterializationKind()) { case MaterializationKind::Target: newMaterialization = converter->materializeTargetConversion( rewriter, op->getLoc(), op.getResultTypes(), inputOperands, - rewrite->getOriginalType()); + info.getOriginalType()); break; case MaterializationKind::Source: assert(op->getNumResults() == 1 && "expected single result"); @@ -2767,7 +2766,7 @@ LogicalResult OperationConverter::convertOperations(ArrayRef<Operation *> ops) { // Gather all unresolved materializations. SmallVector<UnrealizedConversionCastOp> allCastOps; - const DenseMap<UnrealizedConversionCastOp, UnresolvedMaterializationRewrite *> + const DenseMap<UnrealizedConversionCastOp, UnresolvedMaterializationInfo> &materializations = rewriterImpl.unresolvedMaterializations; for (auto it : materializations) allCastOps.push_back(it.first); @@ -2784,7 +2783,8 @@ LogicalResult OperationConverter::convertOperations(ArrayRef<Operation *> ops) { for (UnrealizedConversionCastOp castOp : remainingCastOps) { auto it = materializations.find(castOp); assert(it != materializations.end() && "inconsistent state"); - if (failed(legalizeUnresolvedMaterialization(rewriter, it->second))) + if (failed( + legalizeUnresolvedMaterialization(rewriter, castOp, it->second))) return failure(); } } diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index ed476da28d6be..be71737e4b5b4 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -200,6 +200,7 @@ class _OperationBase: def get_asm( binary: Literal[True], large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -212,6 +213,7 @@ class _OperationBase: self, binary: bool = False, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -253,6 +255,7 @@ class _OperationBase: def print( self, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -270,6 +273,10 @@ class _OperationBase: binary: Whether to write bytes (True) or str (False). Defaults to False. large_elements_limit: Whether to elide elements attributes above this number of elements. Defaults to None (no limit). + large_resource_limit: Whether to elide resource strings above this + number of characters. Defaults to None (no limit). If large_elements_limit + is set and this is None, the behavior will be to use large_elements_limit + as large_resource_limit. enable_debug_info: Whether to print debug/location information. Defaults to False.
pretty_debug_info: Whether to format debug information for easier reading diff --git a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi index 0d2eaffe16d3e..1010daddae2aa 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi @@ -23,6 +23,7 @@ class PassManager: print_after_change: bool = False, print_after_failure: bool = False, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, print_generic_op_form: bool = False, tree_printing_dir_path: str | None = None, diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir index 77103fa5c25f1..e48c94195ea56 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir @@ -127,12 +127,15 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[ALLOC:.*]] = memref.alloc() // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[C0_I64:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64 // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]] // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1] // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64 // CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64 - // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]] + // CHECK: %[[DSTIDX1:.*]] = llvm.add %[[DSTIDX]], %[[C0_I64]] : i64 + // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX1]]] // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace> %c0 = arith.constant 0 : index @@ -151,7 +154,7 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat // CHECK: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64 + // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64 // CHECK: %[[C12:.*]] = arith.constant 12 : index // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]] // CHECK: %[[C32:.*]] = arith.constant 32 : index diff --git a/mlir/test/Conversion/ComplexToROCDLLibraryCalls/complex-to-rocdl-library-calls.mlir b/mlir/test/Conversion/ComplexToROCDLLibraryCalls/complex-to-rocdl-library-calls.mlir new file mode 100644 index 0000000000000..bae7c5986ef9e --- /dev/null +++ b/mlir/test/Conversion/ComplexToROCDLLibraryCalls/complex-to-rocdl-library-calls.mlir @@ -0,0 +1,26 @@ +// RUN: mlir-opt %s -convert-complex-to-rocdl-library-calls | FileCheck %s + +// CHECK-DAG: @__ocml_cabs_f32(complex<f32>) -> f32 +// CHECK-DAG: @__ocml_cabs_f64(complex<f64>) -> f64 +// CHECK-DAG: @__ocml_cexp_f32(complex<f32>) -> complex<f32> +// CHECK-DAG: @__ocml_cexp_f64(complex<f64>) -> complex<f64> + +//CHECK-LABEL: @abs_caller +func.func @abs_caller(%f: complex<f32>, %d: complex<f64>) -> (f32, f64) { + // CHECK: %[[RF:.*]] = call @__ocml_cabs_f32(%{{.*}}) + %rf = complex.abs %f : complex<f32> + // CHECK: %[[RD:.*]] = call @__ocml_cabs_f64(%{{.*}}) + %rd = complex.abs %d : complex<f64> + // CHECK:
return %[[RF]], %[[RD]] + return %rf, %rd : f32, f64 +} + +//CHECK-LABEL: @exp_caller +func.func @exp_caller(%f: complex<f32>, %d: complex<f64>) -> (complex<f32>, complex<f64>) { + // CHECK: %[[EF:.*]] = call @__ocml_cexp_f32(%{{.*}}) + %ef = complex.exp %f : complex<f32> + // CHECK: %[[ED:.*]] = call @__ocml_cexp_f64(%{{.*}}) + %ed = complex.exp %d : complex<f64> + // CHECK: return %[[EF]], %[[ED]] + return %ef, %ed : complex<f32>, complex<f64> +} diff --git a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir index 4af7061a4f8a3..58719e75b1bde 100644 --- a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir @@ -54,7 +54,7 @@ func.func @load_dynamic_source(%source: memref<?x?x?xf32>, // CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] // CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] +// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] // CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32, // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] diff --git a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir index d68a02b54e967..0d3da815529e3 100644 --- a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir @@ -56,7 +56,7 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] // CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] +// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] // CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32, // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index c2f760b29afc4..05b41a8233e8c 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -96,7 +96,7 @@ func.func @load_dynamic_source(%source: memref<?x?x?xf32>, // CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] // CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] +// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] // CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32 // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index 8de6c2283b37c..2bfee03892d10 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -60,7 +60,7 @@ func.func
@store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] // CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] +// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] // CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32 // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 6d55583f8bc7c..0d2fd245af9e2 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -222,3 +222,11 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<8xi6> func.return %0 : vector<8xi6> } + +// ----- + +func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) { + // expected-error@+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}} + amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16> + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 51f3bbd9ae45c..5559ac8f1a5c3 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -493,3 +493,14 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16 %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xf16, 3> -> vector<4xf16> func.return %0 : vector<4xf16> } + +// CHECK-LABEL: func @gather_to_lds
func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>) { + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}] + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] + amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>> + amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1] : vector<2xf16>, memref<32x32xf16>, memref<32xf16, #gpu.address_space<workgroup>> + amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>> + func.return +} diff --git a/mlir/test/Dialect/Affine/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir index 0848a924b9d96..c53667a98cfbe 100644 --- a/mlir/test/Dialect/Affine/slicing-utils.mlir +++ b/mlir/test/Dialect/Affine/slicing-utils.mlir @@ -292,3 +292,26 @@ func.func @slicing_test_multiple_return(%arg0: index) -> (index, index) { %0:2 = "slicing-test-op"(%arg0, %arg0): (index, index) -> (index, index) return %0#0, %0#1 : index, index } + +// ----- + +// FWD-LABEL: graph_region_with_cycle +// BWD-LABEL: graph_region_with_cycle +// FWDBWD-LABEL: graph_region_with_cycle +func.func @graph_region_with_cycle() { + test.isolated_graph_region { + // FWD: matched: [[V0:%.+]] = "slicing-test-op"([[V1:%.+]]) : (i1) -> i1 forward static slice: + // FWD: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 + // FWD: matched: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1
forward static slice: + // FWD: [[V0]] = "slicing-test-op"([[V1]]) : (i1) -> i1 + + // BWD: matched: [[V0:%.+]] = "slicing-test-op"([[V1:%.+]]) : (i1) -> i1 backward static slice: + // BWD: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 + // BWD: matched: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 backward static slice: + // BWD: [[V0]] = "slicing-test-op"([[V1]]) : (i1) -> i1 + %0 = "slicing-test-op"(%1) : (i1) -> i1 + %1 = "slicing-test-op"(%0) : (i1) -> i1 + } + + return +} diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 076f3a99cd393..3d5a46d13e59d 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -1940,6 +1940,18 @@ func.func @bitcastPoisonFPtoI() -> i32 { // ----- +// CHECK-LABEL: func @bitcastChain( +// CHECK-SAME: %[[arg:.*]]: i16) +// CHECK: %[[cast:.*]] = arith.bitcast %[[arg]] : i16 to f16 +// CHECK: return %[[cast]] +func.func @bitcastChain(%arg: i16) -> f16 { + %0 = arith.bitcast %arg : i16 to bf16 + %1 = arith.bitcast %0 : bf16 to f16 + return %1 : f16 +} + +// ----- + // CHECK-LABEL: test_maxsi // CHECK-DAG: %[[C0:.+]] = arith.constant 42 // CHECK-DAG: %[[MAX_INT_CST:.+]] = arith.constant 127 diff --git a/mlir/test/Dialect/LLVMIR/ifunc.mlir b/mlir/test/Dialect/LLVMIR/ifunc.mlir new file mode 100644 index 0000000000000..33e21bab0d51b --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/ifunc.mlir @@ -0,0 +1,40 @@ +// RUN: mlir-opt %s -split-input-file --verify-roundtrip | FileCheck %s + +// CHECK: llvm.mlir.ifunc external @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.mlir.ifunc @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: llvm.mlir.ifunc linkonce_odr hidden @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.mlir.ifunc linkonce_odr hidden @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: llvm.mlir.ifunc private @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.mlir.ifunc private @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: llvm.mlir.ifunc weak @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.mlir.ifunc weak @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index bd1106e304c60..7f2c8c72e5cf9 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -1931,3 +1931,30 @@ llvm.func @invalid_xevm_matrix_3(%a: !llvm.ptr<1>, %base_width_a: i32, %base_hei llvm.return %loaded_a : vector<8xi16> } +// ----- + +llvm.func external @resolve_foo() -> !llvm.ptr attributes {dso_local} +// expected-error@+1 {{'llvm.mlir.ifunc' op resolver must be a definition}} +llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} + +// ----- + +llvm.mlir.global external @resolve_foo() : !llvm.ptr +// expected-error@+1 {{'llvm.mlir.ifunc' op must 
have a function resolver}} +llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} + +// ----- + +llvm.func external @resolve_foo() -> !llvm.ptr +// expected-error@+1 {{'llvm.mlir.ifunc' op 'common' linkage not supported in ifuncs}} +llvm.mlir.ifunc common @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} + +// ----- + +llvm.mlir.global external @resolve_foo() : !llvm.ptr +llvm.mlir.alias external @alias_resolver : !llvm.ptr { + %0 = llvm.mlir.addressof @resolve_foo : !llvm.ptr + llvm.return %0 : !llvm.ptr +} +// expected-error@+1 {{'llvm.mlir.ifunc' op must have a function resolver}} +llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @alias_resolver {dso_local} diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index 9e501affdd2a5..4fc39e220f86d 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -840,6 +840,99 @@ module attributes {transform.with_named_sequence} { } } +// ----- + +///---------------------------------------------------------------------------------------- +/// Tests for linalg.mmt4d +///---------------------------------------------------------------------------------------- + +func.func @mmt4d(%A: memref<16x16x8x1xf32>, %B: memref<16x16x8x1xf32>, %C_in: memref<16x16x8x8xf32>) { + linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x8x1xf32>) + outs(%C_in: memref<16x16x8x8xf32>) + return +} + +// CHECK-LABEL: func.func @mmt4d( +// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, %[[B:.*]]: memref<16x16x8x1xf32>, %[[C:.*]]: memref<16x16x8x8xf32>) { +// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32> +// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32> +// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C]]{{.*}} : memref<16x16x8x8xf32>, vector<16x16x8x8xf32> +// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x8x1xf32> +// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x8x1xf32> to vector<16x16x8x8xf32> +// CHECK: vector.transfer_write %[[RED]], %[[C]]{{.*}} : vector<16x16x8x8xf32>, memref<16x16x8x8xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %mmt4d : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @mmt4d_scalable(%A: memref<16x16x8x1xf32>, %B: memref<16x16x?x1xf32>, %C_in: memref<16x16x8x?xf32>) { + linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x?x1xf32>) + outs(%C_in: memref<16x16x8x?xf32>) + return +} +// CHECK-LABEL: func.func @mmt4d_scalable( +// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, +// CHECK-SAME: %[[B:.*]]: memref<16x16x?x1xf32>, +// CHECK-SAME: %[[C_IN:.*]]: memref<16x16x8x?xf32>) { +// CHECK: %[[VAL_0:.*]] = arith.constant 16 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 16 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 16 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[DIM_2:.*]] = memref.dim %[[B]], %[[C2]] : memref<16x16x?x1xf32> +// CHECK: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK: %[[VEC_A:.*]] =
vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[MASK_1:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_2]], %[[DIM_2]], %[[VAL_6]] : vector<16x16x[4]x1xi1> +// CHECK: %[[VEC_B:.*]] = vector.mask %[[MASK_1]] { vector.transfer_read %[[B]]{{.*}} : memref<16x16x?x1xf32>, vector<16x16x16x8x[4]x1xf32> } : vector<16x16x[4]x1xi1> -> vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[MASK_2:.*]] = vector.create_mask %[[VAL_0]], %[[VAL_1]], %[[C8]], %[[DIM_2]] : vector<16x16x8x[4]xi1> +// CHECK: %[[VAL_15:.*]] = vector.mask %[[MASK_2]] { vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32> } : vector<16x16x8x[4]xi1> -> vector<16x16x8x[4]xf32> +// CHECK: %[[VAL_16:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[MASK_3:.*]] = vector.create_mask %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[C8]], %[[DIM_2]], %[[VAL_6]] : vector<16x16x16x8x[4]x1xi1> +// CHECK: %[[VAL_18:.*]] = vector.mask %[[MASK_3]] { vector.multi_reduction <add>, %[[VAL_16]], %[[VAL_15]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32> } : vector<16x16x16x8x[4]x1xi1> -> vector<16x16x8x[4]xf32> +// CHECK: vector.mask %[[MASK_2]] { vector.transfer_write %[[VAL_18]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32> } : vector<16x16x8x[4]xi1> + + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %mmt4d vector_sizes [16, 16, 16, 8, [4], 1] : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @mmt4d_scalable_with_assume(%A: memref<16x16x8x1xf32>, %B: memref<16x16x?x1xf32>, %C_in: memref<16x16x8x?xf32>) { + linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x?x1xf32>) + outs(%C_in: memref<16x16x8x?xf32>) + return +} +// CHECK-LABEL: func.func @mmt4d_scalable_with_assume( +// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, +// CHECK-SAME: %[[B:.*]]: memref<16x16x?x1xf32>, +// CHECK-SAME: %[[C_IN:.*]]: memref<16x16x8x?xf32>) { +// CHECK-NOT: mask +// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x?x1xf32>, vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[VAL_13:.*]] = vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32> +// CHECK: %[[VAL_14:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[VAL_15:.*]] = vector.multi_reduction <add>, %[[VAL_14]], %[[VAL_13]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32> +// CHECK: vector.transfer_write %[[VAL_15]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %mmt4d vector_sizes [16, 16, 16, 8, [4], 1] {assume_dynamic_dims_match_vec_sizes} : !transform.any_op + transform.yield + } +} + ///---------------------------------------------------------------------------------------- /// Tests for other Ops ///---------------------------------------------------------------------------------------- @@
-1094,30 +1187,6 @@ module attributes {transform.with_named_sequence} { } } -// ----- - -func.func @mmt4d(%A: memref<16x16x8x1xf32>, %B: memref<16x16x8x1xf32>, %C_in: memref<16x16x8x8xf32>) { - linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x8x1xf32>) - outs(%C_in: memref<16x16x8x8xf32>) - return -} - -// CHECK-LABEL: func.func @mmt4d( -// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, %[[B:.*]]: memref<16x16x8x1xf32>, %[[C:.*]]: memref<16x16x8x8xf32>) { -// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32> -// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32> -// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C]]{{.*}} : memref<16x16x8x8xf32>, vector<16x16x8x8xf32> -// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x8x1xf32> -// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x8x1xf32> to vector<16x16x8x8xf32> -// CHECK: vector.transfer_write %[[RED]], %[[C]]{{.*}} : vector<16x16x8x8xf32>, memref<16x16x8x8xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %mmt4d : !transform.any_op - transform.yield - } -} // ----- @@ -1158,6 +1227,7 @@ module attributes {transform.with_named_sequence} { // ----- // CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack +// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?x16x2xf32>) -> tensor<?x?xf32> { // CHECK: %[[C0:.*]] = arith.constant 0 // CHECK: %[[DIM:.*]] = tensor.dim %arg0, %[[C0]] : tensor<?x?xf32> @@ -1175,9 +1245,8 @@ func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: t // CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32> // CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32> // CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32> -// CHECK: %[[empt0:.*]] = tensor.empty // CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1> -// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]] +// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[ARG_0]] // CHECK: return %[[write0]] %ret = linalg.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor<?x?x16x2xf32> -> tensor<?x?xf32> return %ret : tensor<?x?xf32> @@ -1193,6 +1262,8 @@ module attributes {transform.with_named_sequence} { } } // ----- // CHECK-LABEL: func @test_vectorize_unpack +// CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]]= arith.constant 0 : index @@ -1201,15 +1272,14 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 // CHECK: %[[C32:.*]] = arith.constant 32 : index // CHECK: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[MSK0:.*]] = vector.create_mask %[[C8]], %[[C80]], %[[C32]], %[[C16]] :
vector<16x8x32x16xi1> - // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] {{.*}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32> + // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] { vector.transfer_read %[[SRC]]{{.*}}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32> // CHECK: %[[TRANSP0:.*]] = vector.transpose %[[READ0]], [0, 2, 1, 3] : vector<16x8x32x16xf32> to vector<16x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP0]] : vector<16x32x8x16xf32> to vector<512x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> // CHECK: %[[C01:.*]] = arith.constant 0 : index // CHECK: %[[C256:.*]] = arith.constant 256 : index // CHECK: %[[C128:.*]] = arith.constant 128 : index // CHECK: %[[WRITEMSK:.*]] = vector.create_mask %[[C256]], %[[C128]] : vector<512x128xi1> - // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] {{.*}} : vector<512x128xi1> -> tensor<256x128xf32> + // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] { vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<512x128xi1> -> tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> @@ -1225,15 +1295,16 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 // ----- // CHECK-LABEL: func @test_vectorize_unpack_no_masks +// CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> @@ -1248,16 +1319,17 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // ----- - // CHECK-LABEL: test_vectorize_unpack_with_outer_perm +// CHECK-LABEL: test_vectorize_unpack_with_outer_perm +// CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_with_outer_perm(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: 
%[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> @@ -1327,15 +1399,17 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-LABEL: test_vectorize_unpack_no_vector_sizes +// CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> @@ -1350,15 +1424,17 @@ func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, // ----- +// CHECK-LABEL: test_vectorize_unpack_no_vector_sizes_slice_output +// CHECK-SAME: %[[SRC:.*]]: tensor<8x4x16x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<64x127xf32> func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x4x16x16xf32>, %dest: tensor<64x127xf32>) -> tensor<64x127xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x4x16x16xf32> to vector<4x16x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<4x16x8x16xf32> to vector<64x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<64x127xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[EMPT]]{{\[}}%[[C00]], %[[C00]]] + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]] // CHECK-SAME: {in_bounds = [true, false]} : 
vector<64x128xf32>, tensor<64x127xf32> // CHECK: return %[[WRIT]] : tensor<64x127xf32> %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32> @@ -1374,18 +1450,20 @@ func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x // ----- +// CHECK-LABEL: test_vectorize_unpack_no_vector_sizes_permute +// CHECK-SAME: %[[SRC:.*]]: tensor<4x7x4xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<7x16xf32> func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf32>, %dest: tensor<7x16xf32>) -> tensor<7x16xf32> { %0 = linalg.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32> return %0 : tensor<7x16xf32> } // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<4x7x4xf32>, vector<4x7x4xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<4x7x4xf32>, vector<4x7x4xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 0, 2] : vector<4x7x4xf32> to vector<7x4x4xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<7x4x4xf32> to vector<7x16xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<7x16xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<7x16xf32>, tensor<7x16xf32> + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<7x16xf32>, tensor<7x16xf32> // CHECK: return %[[WRIT]] : tensor<7x16xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir index 704cdaf838f45..fa803efa1d910 100644 --- a/mlir/test/Dialect/MemRef/invalid.mlir +++ b/mlir/test/Dialect/MemRef/invalid.mlir @@ -962,6 +962,24 @@ func.func @test_store_zero_results2(%x: i32, %p: memref) { // ----- +func.func @invalid_load_alignment(%memref: memref<4xi32>) { + %c0 = arith.constant 0 : index + // expected-error @below {{'memref.load' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + %val = memref.load %memref[%c0] { alignment = -1 } : memref<4xi32> + return +} + +// ----- + +func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: i32) { + %c0 = arith.constant 0 : index + // expected-error @below {{'memref.store' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + memref.store %val, %memref[%c0] { alignment = 3 } : memref<4xi32> + return +} + +// ----- + func.func @test_alloc_memref_map_rank_mismatch() { ^bb0: // expected-error@+1 {{memref layout mismatch between rank and affine map: 2 != 1}} diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index e11de7bec2d0a..6c2298a3f8acb 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -265,6 +265,17 @@ func.func @zero_dim_no_idx(%arg0 : memref, %arg1 : memref, %arg2 : mem // CHECK: memref.store %{{.*}}, %{{.*}}[] : memref } + +// CHECK-LABEL: func @load_store_alignment +func.func @load_store_alignment(%memref: memref<4xi32>) { + %c0 = arith.constant 0 
: index + // CHECK: memref.load {{.*}} {alignment = 16 : i64} + %val = memref.load %memref[%c0] { alignment = 16 } : memref<4xi32> + // CHECK: memref.store {{.*}} {alignment = 16 : i64} + memref.store %val, %memref[%c0] { alignment = 16 } : memref<4xi32> + return +} + // CHECK-LABEL: func @memref_view(%arg0 func.func @memref_view(%arg0 : index, %arg1 : index, %arg2 : index) { %0 = memref.alloc() : memref<2048xi8> diff --git a/mlir/test/Dialect/Mesh/simplifications.mlir b/mlir/test/Dialect/Mesh/simplifications.mlir index 2540fbf9510c4..e955f4c134259 100644 --- a/mlir/test/Dialect/Mesh/simplifications.mlir +++ b/mlir/test/Dialect/Mesh/simplifications.mlir @@ -165,3 +165,15 @@ func.func @all_reduce_arith_minsi_endomorphism( // CHECK: return %[[ALL_REDUCE_RES]] return %2 : tensor<5xi32> } + +// Ensure that this case, which has no endomorphism op, does not crash. +// CHECK-LABEL: func.func @no_endomorphism_op +func.func @no_endomorphism_op(%arg0: tensor<2xi64>) -> i64 { + %c0 = arith.constant 0 : index + %c1_i64 = arith.constant 1 : i64 + // CHECK: tensor.extract + %extracted = tensor.extract %arg0[%c0] : tensor<2xi64> + // CHECK: arith.maxsi + %0 = arith.maxsi %extracted, %c1_i64 : i64 + return %0 : i64 +} diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 7608ad57c7967..5088f2dfa7d7a 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -2993,3 +2993,27 @@ llvm.func @invalid_mapper(%0 : !llvm.ptr) { } llvm.return } + +// ----- +func.func @invalid_allocate_align_1(%arg0 : memref) -> () { + // expected-error @below {{failed to satisfy constraint: 64-bit signless integer attribute whose value is positive}} + omp.allocate_dir (%arg0 : memref) align(-1) + + return +} + +// ----- +func.func @invalid_allocate_align_2(%arg0 : memref) -> () { + // expected-error @below {{must be power of 2}} + omp.allocate_dir (%arg0 : memref) align(3) + + return +} + +// ----- +func.func @invalid_allocate_allocator(%arg0 : memref) -> () { + // expected-error @below {{invalid clause value}} + omp.allocate_dir (%arg0 : memref) allocator(omp_small_cap_mem_alloc) + + return +} diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 47cfc5278a5d0..4c50ed3230976 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -3197,3 +3197,36 @@ func.func @omp_workshare_loop_wrapper_attrs(%idx : index) { } return } + +// CHECK-LABEL: func.func @omp_allocate_dir( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: memref) { +func.func @omp_allocate_dir(%arg0 : memref, %arg1 : memref) -> () { + + // Test with one data var + // CHECK: omp.allocate_dir(%[[ARG0]] : memref) + omp.allocate_dir (%arg0 : memref) + + // Test with two data vars + // CHECK: omp.allocate_dir(%[[ARG0]], %[[ARG1]] : memref, memref) + omp.allocate_dir (%arg0, %arg1: memref, memref) + + // Test with one data var and align clause + // CHECK: omp.allocate_dir(%[[ARG0]] : memref) align(2) + omp.allocate_dir (%arg0 : memref) align(2) + + // Test with one data var and allocator clause + // CHECK: omp.allocate_dir(%[[ARG0]] : memref) allocator(omp_pteam_mem_alloc) + omp.allocate_dir (%arg0 : memref) allocator(omp_pteam_mem_alloc) + + // Test with one data var, align clause and allocator clause + // CHECK: omp.allocate_dir(%[[ARG0]] : memref) align(2) allocator(omp_thread_mem_alloc) + omp.allocate_dir (%arg0 : memref) align(2) allocator(omp_thread_mem_alloc) + + // Test with two data vars, align clause and allocator
clause + // CHECK: omp.allocate_dir(%[[ARG0]], %[[ARG1]] : memref, memref) align(2) allocator(omp_cgroup_mem_alloc) + omp.allocate_dir (%arg0, %arg1 : memref, memref) align(2) allocator(omp_cgroup_mem_alloc) + + return +} + diff --git a/mlir/test/Dialect/SPIRV/IR/availability.mlir b/mlir/test/Dialect/SPIRV/IR/availability.mlir index 9c8665b1e4bbe..f56bc3967b4b7 100644 --- a/mlir/test/Dialect/SPIRV/IR/availability.mlir +++ b/mlir/test/Dialect/SPIRV/IR/availability.mlir @@ -292,3 +292,17 @@ func.func @set_mesh_outputs(%0 : i32, %1 : i32) -> () { spirv.EXT.SetMeshOutputs %0, %1 : i32, i32 spirv.Return } + +//===----------------------------------------------------------------------===// +// Replicated Composite Constant op +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: constant_composite_replicate +func.func @constant_composite_replicate() -> () { + // CHECK: min version: v1.0 + // CHECK: max version: v1.6 + // CHECK: extensions: [ [SPV_EXT_replicated_composites] ] + // CHECK: capabilities: [ [ReplicatedCompositesEXT] ] + %0 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : vector<2xi32> + spirv.Return +} diff --git a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir index 207549afdda94..99ad2a8a2e64b 100644 --- a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir @@ -163,6 +163,51 @@ func.func @coop_matrix_const_wrong_type() -> () { return } +// ----- + +//===----------------------------------------------------------------------===// +// spirv.EXT.ConstantCompositeReplicate +//===----------------------------------------------------------------------===// + +func.func @ccr_result_not_composite() -> () { + // expected-error @+1 {{op result #0 must be vector of bool or 8/16/32/64-bit integer or 16/32/64-bit float or BFloat16 values of length 2/3/4/8/16 or any SPIR-V array type or any SPIR-V runtime array type or any SPIR-V struct type or any SPIR-V cooperative matrix type or any SPIR-V matrix type or any SPIR-V tensorArm type, but got 'i32'}} + %0 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : i32 + return +} + +// ----- + +func.func @ccr_wrong_splat_type() -> () { + // expected-error @+1 {{expected value attribute type 'f32', but got: 'i32'}} + %0 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : vector<2xf32> + return +} + +// ----- + +func.func @ccr_wrong_splat_type() -> () { + // expected-error @+1 {{expected value attribute type '!spirv.array<3 x i32>' or 'i32', but got: 'vector<2xi32>'}} + %0 = spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<2 x !spirv.array<3 x i32>> + return +} + +// ----- + +func.func @ccr_wrong_splat_type() -> () { + // expected-error @+1 {{expected value attribute type 'f32', but got: 'i32'}} + %0 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.arm.tensor<2x3xf32> + return +} + +// ----- + +func.func @ccr_wrong_splat_type() -> () { + // expected-error @+1 {{expected value attribute type 'vector<3xi32>' or 'i32', but got: 'vector<2xi32>'}} + %0 = spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<2 x vector<3xi32>> + return +} + + // ----- //===----------------------------------------------------------------------===// @@ -854,6 +899,48 @@ spirv.module Logical GLSL450 { spirv.SpecConstantComposite @scc (@sc1) : !spirv.coopmatrix<8x16xf32, Device, MatrixA> } +//===----------------------------------------------------------------------===// +// 
spirv.EXT.SpecConstantCompositeReplicate +//===----------------------------------------------------------------------===// + +// ----- + +spirv.module Logical GLSL450 { + // expected-error @+1 {{result type must be a composite type, but provided 'i32'}} + spirv.EXT.SpecConstantCompositeReplicate @sccr (@sc_i32_1) : i32 +} + +// ----- + +spirv.module Logical GLSL450 { + // expected-error @+1 {{splat spec constant reference defining constituent not found}} + spirv.EXT.SpecConstantCompositeReplicate @sccr (@sc_f32_1) : !spirv.array<3 x i32> +} + +// ----- + +spirv.module Logical GLSL450 { + spirv.SpecConstant @sc_f32_1 = 1.0 : f32 + // expected-error @+1 {{constituent has incorrect type: expected 'i32', but provided 'f32'}} + spirv.EXT.SpecConstantCompositeReplicate @sccr (@sc_f32_1) : !spirv.array<3 x i32> +} + +// ----- + +spirv.module Logical GLSL450 { + spirv.SpecConstant @sc_f32_1 = 1.0 : f32 + // expected-error @+1 {{constituent has incorrect type: expected 'i32', but provided 'f32'}} + spirv.EXT.SpecConstantCompositeReplicate @sccr (@sc_f32_1) : !spirv.struct<(i32, i32, i32)> +} + +// ----- + +spirv.module Logical GLSL450 { + spirv.SpecConstant @sc_f32_1 = 1.0 : f32 + // expected-error @+1 {{constituent has incorrect type: expected 'i32', but provided 'f32'}} + spirv.EXT.SpecConstantCompositeReplicate @sccr (@sc_f32_1) : !spirv.arm.tensor<2x3xi32> +} + //===----------------------------------------------------------------------===// // spirv.SpecConstantOperation //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir index 27280807b0282..11c8d54fda055 100644 --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -1338,3 +1338,14 @@ func.func @no_fold_mul_result_exceeds_i32() -> tensor<i32> { %3 = tosa.mul %0, %1, %2 : (tensor<i32>, tensor<i32>, tensor<1xi8>) -> tensor<i32> return %3 : tensor<i32> } + +// ----- + +// CHECK-LABEL: @test_fold_i1_to_i32_cast +// CHECK: %[[OUT:.*]] = "tosa.const"() <{values = dense<1> : tensor<i32>}> : () -> tensor<i32> +// CHECK: return %[[OUT]] : tensor<i32> +func.func @test_fold_i1_to_i32_cast() -> tensor<i32> { + %0 = "tosa.const"() <{values = dense<1> : tensor<i1>}> : () -> tensor<i1> + %1 = "tosa.cast"(%0) : (tensor<i1>) -> tensor<i32> + return %1 : tensor<i32> +} diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir index e280c1155f526..d0f40279421f4 100644 --- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir @@ -1032,6 +1032,15 @@ func.func @transpose_conv2d_strided(%arg0: tensor<1x5x7x1xf32>, %arg1: tensor<1x // ----- + +// CHECK-LABEL: @transpose_conv2d_dynamic_out_channels +func.func @transpose_conv2d_dynamic_out_channels(%arg0: tensor<2x1x1x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) { + // CHECK: -> tensor<2x3x6x5xf32> + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x1x1x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<2x3x6x?xf32> + return +} + +// ----- + // CHECK-LABEL: @resize_int_horizontal func.func @resize_int_horizontal(%arg0: tensor<1x15x13x1xi8>) { %scale = tosa.const_shape { values = dense<[11, 7, 89, 6]> : tensor<4xindex> } : () -> !tosa.shape<4> diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index
12187dd18012b..ea2343efd246e 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -1379,6 +1379,21 @@ func.func @extract_strided_broadcast4(%arg0: f32) -> vector<1x4xf32> { // ----- +// Check the case where the same dimension is both broadcasted and sliced +// CHECK-LABEL: func @extract_strided_broadcast5 +// CHECK-SAME: (%[[ARG:.+]]: vector<2x1xf32>) +// CHECK: %[[V:.+]] = vector.broadcast %[[ARG]] : vector<2x1xf32> to vector<2x4xf32> +// CHECK: return %[[V]] +func.func @extract_strided_broadcast5(%arg0: vector<2x1xf32>) -> vector<2x4xf32> { + %0 = vector.broadcast %arg0 : vector<2x1xf32> to vector<2x8xf32> + %1 = vector.extract_strided_slice %0 + {offsets = [0, 4], sizes = [2, 4], strides = [1, 1]} + : vector<2x8xf32> to vector<2x4xf32> + return %1 : vector<2x4xf32> +} + +// ----- + // CHECK-LABEL: consecutive_shape_cast // CHECK: %[[C:.*]] = vector.shape_cast %{{.*}} : vector<16xf16> to vector<4x4xf16> // CHECK-NEXT: return %[[C]] : vector<4x4xf16> diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index 5038646e1f026..8017140a0bfab 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1995,6 +1995,15 @@ func.func @vector_load(%src : memref) { // ----- +func.func @invalid_load_alignment(%memref: memref<4xi32>) { + %c0 = arith.constant 0 : index + // expected-error @below {{'vector.load' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + %val = vector.load %memref[%c0] { alignment = -1 } : memref<4xi32>, vector<4xi32> + return +} + +// ----- + //===----------------------------------------------------------------------===// // vector.store //===----------------------------------------------------------------------===// @@ -2005,3 +2014,12 @@ func.func @vector_store(%dest : memref, %vec : vector<16x16xi8>) { vector.store %vec, %dest[%c0] : memref, vector<16x16xi8> return } + +// ----- + +func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: vector<4xi32>) { + %c0 = arith.constant 0 : index + // expected-error @below {{'vector.store' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + vector.store %val, %memref[%c0] { alignment = 3 } : memref<4xi32>, vector<4xi32> + return +} diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index 10bf0f1620568..39578ac56e369 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -853,6 +853,16 @@ func.func @vector_load_and_store_2d_vector_memref(%memref : memref<200x100xvecto return } +// CHECK-LABEL: func @load_store_alignment +func.func @load_store_alignment(%memref: memref<4xi32>) { + %c0 = arith.constant 0 : index + // CHECK: vector.load {{.*}} {alignment = 16 : i64} + %val = vector.load %memref[%c0] { alignment = 16 } : memref<4xi32>, vector<4xi32> + // CHECK: vector.store {{.*}} {alignment = 16 : i64} + vector.store %val, %memref[%c0] { alignment = 16 } : memref<4xi32>, vector<4xi32> + return +} + // CHECK-LABEL: @masked_load_and_store func.func @masked_load_and_store(%base: memref, %mask: vector<16xi1>, %passthru: vector<16xf32>) { %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index 9fa9d56e4a324..ae8fce786ee57 100644 --- 
a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -531,6 +531,35 @@ func.func @warp_scf_for_swap_no_yield(%arg0: index) { return } +// ----- +// scf.for result is not distributed in this case. +// CHECK-PROP-LABEL: func @warp_scf_for_broadcasted_result( +// CHECK-PROP: %[[W0:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { +// CHECK-PROP: %[[INI:.*]] = "some_def"() : () -> vector<1xf32> +// CHECK-PROP: gpu.yield %[[INI]] : vector<1xf32> +// CHECK-PROP: } +// CHECK-PROP: %[[F:.*]] = scf.for {{.*}} iter_args(%[[ARG2:.*]] = %[[W0]]) -> (vector<1xf32>) { +// CHECK-PROP: %[[W1:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[ARG2]] : vector<1xf32>) -> (vector<1xf32>) { +// CHECK-PROP: ^bb0(%{{.*}}: vector<1xf32>): +// CHECK-PROP: %[[T0:.*]] = "some_op"(%{{.*}}) : (vector<1xf32>) -> vector<1xf32> +// CHECK-PROP: gpu.yield %[[T0]] : vector<1xf32> +// CHECK-PROP: } +// CHECK-PROP: scf.yield %[[W1]] : vector<1xf32> +func.func @warp_scf_for_broadcasted_result(%arg0: index) -> vector<1xf32> { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %2 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<1xf32>) { + %ini = "some_def"() : () -> (vector<1xf32>) + %0 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<1xf32>) { + %1 = "some_op"(%arg4) : (vector<1xf32>) -> (vector<1xf32>) + scf.yield %1 : vector<1xf32> + } + gpu.yield %0 : vector<1xf32> + } + return %2 : vector<1xf32> +} + // ----- #map = affine_map<()[s0] -> (s0 * 4)> @@ -584,6 +613,85 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref, %arg2 return } +// ----- +// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_for_result( +// CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32> +// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> +// CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32> +// CHECK-PROP: } +// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32> +// CHECK-PROP: %[[ACC1:.*]] = "some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32> +// CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<128xf32>, vector<128xf32> +// CHECK-PROP: } +// CHECK-PROP: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<4xf32>, vector<4xf32> +// CHECK-PROP: } +// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> () +func.func @warp_scf_for_unused_for_result(%arg0: index) { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { + %ini = "some_def"() : () -> (vector<128xf32>) + %ini1 = "some_def"() : () -> (vector<128xf32>) + %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini, %arg5 = %ini1) -> (vector<128xf32>, vector<128xf32>) { + %add = arith.addi %arg3, %c1 : index + %1 = "some_def"(%arg5, %add) : (vector<128xf32>, index) -> (vector<128xf32>) + %acc = "some_def"(%add, %arg4, %1) : (index, vector<128xf32>, vector<128xf32>) -> (vector<128xf32>) + 
scf.yield %acc, %1 : vector<128xf32>, vector<128xf32> + } + gpu.yield %3#0 : vector<128xf32> + } + "some_use"(%0) : (vector<4xf32>) -> () + return +} + +// ----- +// CHECK-PROP-LABEL: func.func @warp_scf_for_swapped_for_results( +// CHECK-PROP: %[[W0:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<8xf32>, vector<4xf32>, vector<4xf32>) { +// CHECK-PROP-NEXT: %[[INI0:.*]] = "some_def"() : () -> vector<256xf32> +// CHECK-PROP-NEXT: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> +// CHECK-PROP-NEXT: %[[INI2:.*]] = "some_def"() : () -> vector<128xf32> +// CHECK-PROP-NEXT: gpu.yield %[[INI0]], %[[INI1]], %[[INI2]] : vector<256xf32>, vector<128xf32>, vector<128xf32> +// CHECK-PROP-NEXT: } +// CHECK-PROP-NEXT: %[[F0:.*]]:3 = scf.for {{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1, %{{.*}} = %[[W0]]#2) -> (vector<8xf32>, vector<4xf32>, vector<4xf32>) { +// CHECK-PROP-NEXT: %[[W1:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : +// CHECK-PROP-SAME: vector<8xf32>, vector<4xf32>, vector<4xf32>) -> (vector<8xf32>, vector<4xf32>, vector<4xf32>) { +// CHECK-PROP-NEXT: ^bb0(%{{.*}}: vector<256xf32>, %{{.*}}: vector<128xf32>, %{{.*}}: vector<128xf32>): +// CHECK-PROP-NEXT: %[[T3:.*]] = "some_def_1"(%{{.*}}) : (vector<256xf32>) -> vector<256xf32> +// CHECK-PROP-NEXT: %[[T4:.*]] = "some_def_2"(%{{.*}}) : (vector<128xf32>) -> vector<128xf32> +// CHECK-PROP-NEXT: %[[T5:.*]] = "some_def_3"(%{{.*}}) : (vector<128xf32>) -> vector<128xf32> +// CHECK-PROP-NEXT: gpu.yield %[[T3]], %[[T4]], %[[T5]] : vector<256xf32>, vector<128xf32>, vector<128xf32> +// CHECK-PROP-NEXT: } +// CHECK-PROP-NEXT: scf.yield %[[W1]]#0, %[[W1]]#1, %[[W1]]#2 : vector<8xf32>, vector<4xf32>, vector<4xf32> +// CHECK-PROP-NEXT: } +// CHECK-PROP-NEXT: "some_use_1"(%[[F0]]#2) : (vector<4xf32>) -> () +// CHECK-PROP-NEXT: "some_use_2"(%[[F0]]#1) : (vector<4xf32>) -> () +// CHECK-PROP-NEXT: "some_use_3"(%[[F0]]#0) : (vector<8xf32>) -> () +func.func @warp_scf_for_swapped_for_results(%arg0: index) { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0:3 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>, vector<8xf32>) { + %ini1 = "some_def"() : () -> (vector<256xf32>) + %ini2 = "some_def"() : () -> (vector<128xf32>) + %ini3 = "some_def"() : () -> (vector<128xf32>) + %3:3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini1, %arg5 = %ini2, %arg6 = %ini3) -> (vector<256xf32>, vector<128xf32>, vector<128xf32>) { + %acc1 = "some_def_1"(%arg4) : (vector<256xf32>) -> (vector<256xf32>) + %acc2 = "some_def_2"(%arg5) : (vector<128xf32>) -> (vector<128xf32>) + %acc3 = "some_def_3"(%arg6) : (vector<128xf32>) -> (vector<128xf32>) + scf.yield %acc1, %acc2, %acc3 : vector<256xf32>, vector<128xf32>, vector<128xf32> + } + gpu.yield %3#2, %3#1, %3#0 : vector<128xf32>, vector<128xf32>, vector<256xf32> + } + "some_use_1"(%0#0) : (vector<4xf32>) -> () + "some_use_2"(%0#1) : (vector<4xf32>) -> () + "some_use_3"(%0#2) : (vector<8xf32>) -> () + return +} + // ----- // CHECK-PROP-LABEL: func @vector_reduction( diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 83a98ab0622b7..eb564d55bfd51 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics // ----- -func.func @create_nd_tdesc_vc_1(%src: memref<24xf32>) { +func.func @create_nd_tdesc_1(%src: memref<24xf32>) { // expected-error@+1 {{Expecting 
the TensorDesc rank is not greater than the ranks of shape, strides, offsets or the memref source}} %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32> return @@ -9,47 +9,62 @@ func.func @create_nd_tdesc_vc_1(%src: memref<24xf32>) { // ----- -func.func @create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { +func.func @create_nd_tdesc_2(%src: memref<24x32xf32>) { // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16> return } // ----- -func.func @create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { +func.func @create_nd_tdesc_3(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{SLM is only supported for 1D block tensor}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> return } // ----- -func.func @create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { +func.func @create_nd_tdesc_4(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{Memory space mismatch}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32> return } // ----- -func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_5(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_6(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_7(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } +// ----- +func.func @create_nd_tdesc_8(%src: ui64) { + // expected-error@+1 {{'xegpu.create_nd_tdesc' op Expecting strides and shape to be present for integer source}} + %1 = xegpu.create_nd_tdesc %src : ui64-> !xegpu.tensor_desc<128x128xf32> + return +} + +// ----- +func.func @create_nd_tdesc_9(%src: ui64) { + // expected-error@+1 {{expected mixed offsets rank to match mixed sizes rank}} + %1 = xegpu.create_nd_tdesc %src[0, 0] : ui64-> !xegpu.tensor_desc<128x128xf32> + return +} + + // ----- func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 3bfe1fa81aa6e..695437354cd7c 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -17,8 +17,8 @@ gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) { gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { //CHECK: %[[C:.*]] = arith.constant 1 : index %c1 = arith.constant 1 : index - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> 
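+  // Note on the syntax update exercised below: the bare `[...], [...]` lists
+  // that previously followed the offsets now use explicit keywords. A minimal
+  // sketch of the two spellings (illustrative only; %src, %x, %y, %h, %w and
+  // %c1 are the values defined in this test):
+  //   old: xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+  //   new: xegpu.create_nd_tdesc %src[%x, %y], shape : [%h, %w], strides : [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>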
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], shape : [%[[arg2]], %[[arg1]]], strides : [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[%x, %y], shape:[%h, %w], strides: [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> gpu.return } @@ -62,6 +62,47 @@ gpu.func @create_nd_tdesc_7(%src: memref<8x24x32x48x64xf32>) { } + +// CHECK: gpu.func @test_create_nd_tdesc_7(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index, %[[arg5:.*]]: memref<24x32xf32>) +gpu.func @test_create_nd_tdesc_7(%src: ui64, %w : index, %h : index, %x : index, %y : index, %src2: memref<24x32xf32>) { + //CHECK: %[[C:.*]] = arith.constant 1 : index + %c1 = arith.constant 1 : index + + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg5]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.create_nd_tdesc %src2 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + + gpu.return +} + +// CHECK: gpu.func @test_create_nd_tdesc_8(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) +gpu.func @test_create_nd_tdesc_8(%src: ui64, %w : index, %h : index, %x : index, %y : index) { + + %c1 = arith.constant 1 : index + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0], shape : [%arg2, %arg1], strides : [%arg1, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> + %2 = xegpu.create_nd_tdesc %src, shape : [%h, %w], strides : [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> + + gpu.return +} + +// CHECK-LABEL: func @test_create_nd_tdesc_9({{.*}}) + +gpu.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) { + + %c1 = arith.constant 1 : index + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[%arg3, %arg4], shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[%x, %y], shape:[%h, %w], strides:[%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16> + + gpu.return +} + +// CHECK-LABEL: func @test_create_nd_tdesc_10({{.*}}) +gpu.func @test_create_nd_tdesc_10(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) { + %c1 = arith.constant 1 : index + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0], shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16> + %2 = xegpu.create_nd_tdesc %src, shape:[%h, %w], strides:[%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16> + + gpu.return +} + // CHECK: gpu.func @prefetch_nd(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @prefetch_nd(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 3d91b2269bc4b..e78ae4a17710b 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -95,10 +95,10 @@ gpu.module @test { // ----- // CHECK-LABEL: gpu.func @load_dpas_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> ->
!xegpu.tensor_desc<8x16xf16> // CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> // CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> @@ -120,10 +120,10 @@ gpu.module @test { // ----- // CHECK-LABEL: gpu.func @load_dpas_postop_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> // CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> // CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> @@ -150,16 +150,16 @@ gpu.module @test { // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> // CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @test { gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> 
vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } diff --git a/mlir/test/IR/test-pattern-logging-listener.mlir b/mlir/test/IR/test-pattern-logging-listener.mlir new file mode 100644 index 0000000000000..e8f0d61a75960 --- /dev/null +++ b/mlir/test/IR/test-pattern-logging-listener.mlir @@ -0,0 +1,24 @@ +// RUN: mlir-opt %s --test-walk-pattern-rewrite-driver \ +// RUN: --allow-unregistered-dialect --debug-only=pattern-logging-listener 2>&1 | FileCheck %s + +// Check that when replacing an op with a new op, we get appropriate +// pattern-logging lines. The use of check same is to avoid the complexity of +// matching the anonymous namespace prefix, which can be one of {anonymous} vs +// {anonymous_namespace} vs `anonymous_namespace` (and maybe others?) on the +// various platforms. + +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationInserted | test.new_op +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationReplaced (with values) | test.replace_with_new_op +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationModified | arith.addi +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationModified | arith.addi +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationErased | test.replace_with_new_op +func.func @replace_with_new_op() -> i32 { + %a = "test.replace_with_new_op"() : () -> (i32) + %res = arith.addi %a, %a : i32 + return %res : i32 +} diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index d09373bdb3f14..7b0a8494a8acb 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -193,33 +193,33 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> module { - func.func @fuse_tileable_consumer_scf_forall_multi_yielding_consumer(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x64xf32>, %arg3: tensor<64x32xf32>) -> (tensor<64x64xf32>, tensor<2048xf32>) { - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %0:2 = scf.forall (%arg4, %arg5) in (2, 2) shared_outs(%arg6 = %arg3, %arg7 = %arg2) -> (tensor<64x32xf32>, tensor<64x64xf32>) { - %extracted_slice = tensor.extract_slice %arg6[%arg4, %arg5] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> - %extracted_slice_0 = tensor.extract_slice %arg7[%arg4, %arg5] [32, 32] [1, 1] : tensor<64x64xf32> to tensor<32x32xf32> - %6 = linalg.matmul ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) -> tensor<32x32xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %6 into %arg7[%arg4, %arg5] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x64xf32> - tensor.parallel_insert_slice %extracted_slice_0 into %arg6[%arg4, %arg5] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> - } + func.func @fuse_tileable_consumer_scf_forall_multi_yielding_consumer(%arg0: tensor<32x32xf32>, 
%arg1: tensor<32x32xf32>, %arg2: tensor<64x64xf32>, %arg3: tensor<64x32xf32>) -> (tensor<64x64xf32>, tensor<2048xf32>) { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %0:2 = scf.forall (%arg4, %arg5) in (2, 2) shared_outs(%arg6 = %arg3, %arg7 = %arg2) -> (tensor<64x32xf32>, tensor<64x64xf32>) { + %extracted_slice = tensor.extract_slice %arg6[%arg4, %arg5] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %extracted_slice_0 = tensor.extract_slice %arg7[%arg4, %arg5] [32, 32] [1, 1] : tensor<64x64xf32> to tensor<32x32xf32> + %6 = linalg.matmul ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %6 into %arg7[%arg4, %arg5] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x64xf32> + tensor.parallel_insert_slice %extracted_slice_0 into %arg6[%arg4, %arg5] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> } - %1 = tensor.empty() : tensor<64x64xf32> - %2 = tensor.empty() : tensor<64x64xf32> - %3 = tensor.empty() : tensor<64x64xf32> - %4:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%0#1, %1 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%2, %3 : tensor<64x64xf32>, tensor<64x64xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32, %out_1: f32): - %6 = arith.mulf %in, %in_0 : f32 - %7 = arith.subf %out, %6 : f32 - %8 = arith.addf %out_1, %in : f32 - linalg.yield %7, %8 : f32, f32 - } -> (tensor<64x64xf32>, tensor<64x64xf32>) - %5 = tensor.empty() : tensor<2048xf32> - %unpack = linalg.unpack %0#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %5 : tensor<64x32xf32> -> tensor<2048xf32> - return %4#1, %unpack : tensor<64x64xf32>, tensor<2048xf32> } + %1 = tensor.empty() : tensor<64x64xf32> + %2 = tensor.empty() : tensor<64x64xf32> + %3 = tensor.empty() : tensor<64x64xf32> + %4:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%0#1, %1 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%2, %3 : tensor<64x64xf32>, tensor<64x64xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32, %out_1: f32): + %6 = arith.mulf %in, %in_0 : f32 + %7 = arith.subf %out, %6 : f32 + %8 = arith.addf %out_1, %in : f32 + linalg.yield %7, %8 : f32, f32 + } -> (tensor<64x64xf32>, tensor<64x64xf32>) + %5 = tensor.empty() : tensor<2048xf32> + %unpack = linalg.unpack %0#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %5 : tensor<64x32xf32> -> tensor<2048xf32> + return %4#1, %unpack : tensor<64x64xf32>, tensor<2048xf32> + } } module attributes {transform.with_named_sequence} { @@ -269,38 +269,38 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> module { - func.func @fuse_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2048xf32> { - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { - %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { - ^bb0(%in: 
f32, %in_16: f32, %out: f32): - %13 = arith.mulf %in, %in_16 : f32 - %14 = arith.addf %out, %13 : f32 - linalg.yield %14 : f32 - } -> tensor<32x32xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> - } - } - %output = tensor.empty() : tensor<2048xf32> - %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2048xf32> - return %unpack : tensor<2048xf32> + func.func @fuse_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2048xf32> { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { + %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { + ^bb0(%in: f32, %in_16: f32, %out: f32): + %13 = arith.mulf %in, %in_16 : f32 + %14 = arith.addf %out, %13 : f32 + linalg.yield %14 : f32 + } -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> + } } + %output = tensor.empty() : tensor<2048xf32> + %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2048xf32> + return %unpack : tensor<2048xf32> + } } - + module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { - %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %loop = transform.structured.match ops{["scf.forall"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %a, %b = transform.test.fuse_consumer %slice_op in (%loop) - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield - } + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op in (%loop) + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } } // CHECK-DAG: #[[UNPACK_RESULT_OFFSET_MAP:.*]] = affine_map<(d0) -> (d0 * 32)> // CHECK-DAG: #[[UNPACK_RESULT_SIZE_MAP:.*]] = affine_map<(d0) -> (1024, d0 * -32 + 2048)> @@ -332,38 +332,38 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> module { - func.func @fuse_unaligned_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2047xf32> { - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { - %extracted_slice = 
tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { - ^bb0(%in: f32, %in_16: f32, %out: f32): - %13 = arith.mulf %in, %in_16 : f32 - %14 = arith.addf %out, %13 : f32 - linalg.yield %14 : f32 - } -> tensor<32x32xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> - } - } - %output = tensor.empty() : tensor<2047xf32> - %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2047xf32> - return %unpack : tensor<2047xf32> + func.func @fuse_unaligned_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2047xf32> { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { + %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { + ^bb0(%in: f32, %in_16: f32, %out: f32): + %13 = arith.mulf %in, %in_16 : f32 + %14 = arith.addf %out, %13 : f32 + linalg.yield %14 : f32 + } -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> + } } + %output = tensor.empty() : tensor<2047xf32> + %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2047xf32> + return %unpack : tensor<2047xf32> + } } - + module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { - %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %loop = transform.structured.match ops{["scf.forall"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %a, %b = transform.test.fuse_consumer %slice_op in (%loop) - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield - } + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op in (%loop) + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } } // CHECK-DAG: #[[UNPACK_RESULT_OFFSET_MAP:.*]] = affine_map<(d0) -> (d0 * 32)> // CHECK-DAG: #[[UNPACK_RESULT_SIZE_MAP:.*]] = affine_map<(d0) -> (1024, d0 * -32 + 2047)> @@ -395,46 +395,46 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> module { - func.func @fuse_pack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: 
tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<4x32x16xf32> { - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %1 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { - %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { - ^bb0(%in: f32, %in_16: f32, %out: f32): - %13 = arith.mulf %in, %in_16 : f32 - %14 = arith.addf %out, %13 : f32 - linalg.yield %14 : f32 - } -> tensor<32x32xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> - } - } - %output = tensor.empty() : tensor<4x32x16xf32> - %pack = linalg.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> - return %pack : tensor<4x32x16xf32> + func.func @fuse_perfect_tiling_pack_consumer(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<4x32x16xf32> { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %1 = scf.forall (%arg3, %arg4) in (2, 1) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { + %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { + ^bb0(%in: f32, %in_16: f32, %out: f32): + %13 = arith.mulf %in, %in_16 : f32 + %14 = arith.addf %out, %13 : f32 + linalg.yield %14 : f32 + } -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> + } } + %output = tensor.empty() : tensor<4x32x16xf32> + %pack = linalg.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> + return %pack : tensor<4x32x16xf32> + } } - + module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { - %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %loop = transform.structured.match ops{["scf.forall"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %a, %b = transform.test.fuse_consumer %slice_op in (%loop) - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield - } + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op in (%loop) + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } } // CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> -// CHECK: func.func @fuse_pack_consumer_into_scf_forall( +// CHECK: func.func 
@fuse_perfect_tiling_pack_consumer( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<32x32xf32> // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<32x32xf32> // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<64x32xf32>) // CHECK: %[[OUT_INIT:.*]] = tensor.empty() : tensor<4x32x16xf32> -// CHECK: %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 2) +// CHECK: %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 1) // CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG2]], %[[PACK_OUT_ARG:.*]] = %[[OUT_INIT]]) // CHECK-SAME: { // CHECK: %[[GENERIC_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] @@ -451,6 +451,223 @@ module attributes {transform.with_named_sequence} { // ----- +// It is valid to fuse the pack op in perfect tiling scenario when the dimension +// is dynamic and padding is not needed. + +func.func @fuse_pack_consumer_with_no_pad_dynamic_dim(%arg0: tensor<64x?xf32>, %arg1: tensor<64x?xf32>, %1: tensor<64x?x16xf32>) -> tensor<64x?x16xf32> { + %c1 = arith.constant 1 : index + %d1 = tensor.dim %arg0, %c1 : tensor<64x?xf32> + %0 = scf.forall (%arg2) = (0) to (%d1) step (16) shared_outs(%arg3 = %arg1) -> (tensor<64x?xf32>) { + %src = tensor.extract_slice %arg0[0, %arg2] [64, 16] [1, 1] : tensor<64x?xf32> to tensor<64x16xf32> + %dest = tensor.extract_slice %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x?xf32> to tensor<64x16xf32> + %2 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %2 into %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x?xf32> + } + } + %pack = linalg.pack %0 inner_dims_pos = [1] inner_tiles = [16] into %1 : tensor<64x?xf32> -> tensor<64x?x16xf32> + return %pack : tensor<64x?x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK: func.func @fuse_pack_consumer_with_no_pad_dynamic_dim( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] +// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (%{{.+}}) step (16) +// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG1]], %[[PACK_OUT_ARG:.*]] = %[[ARG2]]) +// CHECK: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM:.*]] = linalg.exp +// CHECK-SAME: ins(%[[ELEM_SRC]] +// CHECK-SAME: outs(%[[ELEM_DEST]] +// CHECK-DAG: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV]]) +// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0] [64, 1, 16] [1, 1, 1] +// CHECK: %[[PACK:.*]] = linalg.pack %[[ELEM]] +// CHECK-SAME: inner_dims_pos = [1] inner_tiles = [16] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice 
%[[ELEM]] into %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[PACK]] into %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0] [64, 1, 16] [1, 1, 1] + +// ----- + +// It is valid to fuse the pack op with padding semantics if the tiled +// dimensions do not need padding. + +func.func @fuse_pack_consumer_with_padding_semantics(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>) -> tensor<22x2x3x16xf32> { + %0 = scf.forall (%arg2) = (0) to (32) step (16) shared_outs(%arg3 = %arg1) -> (tensor<64x32xf32>) { + %src = tensor.extract_slice %arg0[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %dest = tensor.extract_slice %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %2 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %2 into %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x32xf32> + } + } + %1 = tensor.empty() : tensor<22x2x3x16xf32> + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [3, 16] into %1 : tensor<64x32xf32> -> tensor<22x2x3x16xf32> + return %pack : tensor<22x2x3x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK: func.func @fuse_pack_consumer_with_padding_semantics( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[OUT_INIT:.*]] = tensor.empty() : tensor<22x2x3x16xf32> +// CHECK-DAG: %[[PAD_VAL:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (32) step (16) +// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG1]], %[[PACK_OUT_ARG:.*]] = %[[OUT_INIT]]) +// CHECK: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM:.*]] = linalg.exp +// CHECK-SAME: ins(%[[ELEM_SRC]] +// CHECK-SAME: outs(%[[ELEM_DEST]] +// CHECK-DAG: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV]]) +// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0, 0] [22, 1, 3, 16] [1, 1, 1, 1] +// CHECK: %[[TILED_PACK_OUT:.*]] = linalg.pack %[[ELEM]] +// CHECK-SAME: padding_value(%[[PAD_VAL]] : f32) +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [3, 16] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[ELEM]] into %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0, 0] [22, 1, 3, 16] [1, 1, 1, 1] + +// ----- + +// It is valid to fuse the pack if the dimension is not tiled even when it needs +// extra padding.
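+// A sketch of the arithmetic behind this case, using the shapes below: the
+// forall tiles only the second dimension, where 2 outer * 16 inner exactly
+// covers the 32 source elements, so no padding touches the loop dimension;
+// the first dimension is untiled and packs 64 elements into 33 outer * 3
+// inner = 99, so all of the padding lands on a dimension that every tile
+// sees in full, and the fusion stays legal.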
+ +func.func @fuse_pack_consumer_with_untiled_extra_padding(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>) -> tensor<33x2x3x16xf32> { + %0 = scf.forall (%arg2) = (0) to (32) step (16) shared_outs(%arg3 = %arg1) -> (tensor<64x32xf32>) { + %src = tensor.extract_slice %arg0[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %dest = tensor.extract_slice %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %2 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %2 into %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x32xf32> + } + } + %1 = tensor.empty() : tensor<33x2x3x16xf32> + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [3, 16] into %1 : tensor<64x32xf32> -> tensor<33x2x3x16xf32> + return %pack : tensor<33x2x3x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK: func.func @fuse_pack_consumer_with_untiled_extra_padding( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[OUT_INIT:.*]] = tensor.empty() : tensor<33x2x3x16xf32> +// CHECK-DAG: %[[PAD_VAL:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (32) step (16) +// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG1]], %[[PACK_OUT_ARG:.*]] = %[[OUT_INIT]]) +// CHECK: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM:.*]] = linalg.exp +// CHECK-SAME: ins(%[[ELEM_SRC]] +// CHECK-SAME: outs(%[[ELEM_DEST]] +// CHECK-DAG: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV]]) +// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0, 0] [33, 1, 3, 16] [1, 1, 1, 1] +// CHECK: %[[TILED_PACK_OUT:.*]] = linalg.pack %[[ELEM]] +// CHECK-SAME: padding_value(%[[PAD_VAL]] : f32) +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [3, 16] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[ELEM]] into %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0, 0] [33, 1, 3, 16] [1, 1, 1, 1] + +// ----- + +// If the dimension is tiled and it needs extra padding, do not fuse the pack +// op.
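+// (In the negative test below, the pack destination adds padding beyond what
+// the inner tiles require, and it does so along a dimension the forall
+// iterates, so a single iteration cannot produce its pack tile from its own
+// slice alone; the transform reports the failure on the
+// tensor.parallel_insert_slice inside the loop, which is where the
+// expected-error is attached.)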
+
+// -----
+
+// If the dimension is tiled and it needs extra padding, do not fuse the pack
+// op.
+
+func.func @nofuse_pack_consumer_with_extra_padding(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>) -> tensor<23x32x3x16xf32> {
+  %0 = scf.forall (%arg2) = (0) to (32) step (16) shared_outs(%arg3 = %arg1) -> (tensor<64x32xf32>) {
+    %src = tensor.extract_slice %arg0[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32>
+    %dest = tensor.extract_slice %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32>
+    %2 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32>
+    scf.forall.in_parallel {
+      // expected-error @below {{failed to fuse consumer of slice}}
+      tensor.parallel_insert_slice %2 into %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x32xf32>
+    }
+  }
+  %1 = tensor.empty() : tensor<23x32x3x16xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  %pack = linalg.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [3, 16] into %1 : tensor<64x32xf32> -> tensor<23x32x3x16xf32>
+  return %pack : tensor<23x32x3x16xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+// -----
+
+// Imperfect tiling is not supported in pack op consumer fusion.
+
+#map = affine_map<(d0) -> (d0 * 5)>
+#map1 = affine_map<(d0) -> (d0)>
+func.func @nofuse_pack_with_imperfect_tiling(%arg0: tensor<30xf32>) -> tensor<5x6xf32> {
+  %0 = tensor.empty() : tensor<30xf32>
+  %1 = scf.forall (%arg1) in (6) shared_outs(%arg2 = %0) -> (tensor<30xf32>) {
+    %3 = affine.apply #map(%arg1)
+    %extracted_slice = tensor.extract_slice %arg0[%3] [5] [1] : tensor<30xf32> to tensor<5xf32>
+    %extracted_slice_0 = tensor.extract_slice %arg2[%3] [5] [1] : tensor<30xf32> to tensor<5xf32>
+    %4 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel"]} ins(%extracted_slice : tensor<5xf32>) outs(%extracted_slice_0 : tensor<5xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %5 = arith.addf %in, %in : f32
+      linalg.yield %5 : f32
+    } -> tensor<5xf32>
+    scf.forall.in_parallel {
+      // expected-error @below {{failed to fuse consumer of slice}}
+      tensor.parallel_insert_slice %4 into %arg2[%3] [5] [1] : tensor<5xf32> into tensor<30xf32>
+    }
+  }
+  %2 = tensor.empty() : tensor<5x6xf32>
+  %pack = linalg.pack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [6] into %2 : tensor<30xf32> -> tensor<5x6xf32>
+  return %pack : tensor<5x6xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+// -----
+
 module {
   func.func @fuse_add_multiple_tilable_consumers(%arg0: tensor<256x256xf32>, %arg1: tensor<256x256xf32>, %arg2:
tensor<256x256xf32>) -> (tensor<256x256xf32>, tensor<256x256xf32>) { %c0 = arith.constant 0 : index @@ -489,7 +706,7 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<256x256xf32> // CHECK: %[[dest0:.*]] = tensor.empty() : tensor<256x256xf32> // CHECK: %[[LOOP_RESULT:.*]]:3 = scf.for %[[IV1:.*]] = %[[C0]] -// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG:.*]] = %[[dest0]], %[[SECOND_OUT_ARG:.*]] = %[[dest0]], %[[THIRD_OUT_ARG:.*]] = %[[dest0]]) +// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG:.*]] = %[[dest0]], %[[SECOND_OUT_ARG:.*]] = %[[dest0]], %[[THIRD_OUT_ARG:.*]] = %[[dest0]]) // CHECK-SAME: { // CHECK: %[[ADD_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][%[[IV1]], 0] [64, 256] [1, 1] // CHECK: %[[ADD_INS0_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[IV1]], 0] [64, 256] [1, 1] @@ -645,7 +862,7 @@ func.func @multi_slice_fusion1(%arg0 : tensor, %arg1 : tensor, % scf.forall.in_parallel { tensor.parallel_insert_slice %generic#0 into %init0[%iv0] [%tilesize] [1] : tensor into tensor tensor.parallel_insert_slice %generic#1 into %init1[%iv0] [%tilesize] [1] : tensor into tensor - } + } } %empty = tensor.empty(%dim0) : tensor %result = linalg.generic { @@ -719,7 +936,7 @@ func.func @multi_slice_fusion2(%arg0 : tensor, %arg1 : tensor, % scf.forall.in_parallel { tensor.parallel_insert_slice %generic0 into %init0[%iv0] [%tilesize] [1] : tensor into tensor tensor.parallel_insert_slice %generic1 into %init1[%iv0] [%tilesize] [1] : tensor into tensor - } + } } %empty = tensor.empty(%dim0) : tensor %result = linalg.generic { diff --git a/mlir/test/Target/LLVMIR/Import/ifunc.ll b/mlir/test/Target/LLVMIR/Import/ifunc.ll new file mode 100644 index 0000000000000..0cec205dfce68 --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/ifunc.ll @@ -0,0 +1,63 @@ +; RUN: mlir-translate --import-llvm %s --split-input-file | FileCheck %s + +; CHECK: llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} +@foo = dso_local ifunc void (ptr, i32), ptr @resolve_foo + +define dso_local void @call_foo(ptr noundef %0, i32 noundef %1) { +; CHECK: llvm.call @foo + call void @foo(ptr noundef %0, i32 noundef %1) + ret void +} + +define dso_local ptr @foo_fptr() { +; CHECK: [[FPTR:%[0-9]+]] = llvm.mlir.addressof @foo +; CHECK: llvm.return [[FPTR]] + ret ptr @foo +} + +define internal ptr @resolve_foo() { + ret ptr @foo_1 +} + +declare void @foo_1(ptr noundef, i32 noundef) + +; // ----- + +define ptr @resolver() { + ret ptr inttoptr (i64 333 to ptr) +} + +@resolver_alias = alias ptr (), ptr @resolver +@resolver_alias_alias = alias ptr (), ptr @resolver_alias + +; CHECK-DAG: llvm.mlir.ifunc external @ifunc : !llvm.func, !llvm.ptr @resolver_alias +@ifunc = ifunc float (i64), ptr @resolver_alias +; CHECK-DAG: llvm.mlir.ifunc external @ifunc2 : !llvm.func, !llvm.ptr @resolver_alias_alias +@ifunc2 = ifunc float (i64), ptr @resolver_alias_alias + +; // ----- + +define ptr @resolver() { + ret ptr inttoptr (i64 333 to ptr) +} + +; CHECK: llvm.mlir.ifunc linkonce_odr hidden @ifunc +@ifunc = linkonce_odr hidden ifunc float (i64), ptr @resolver + +; // ----- + +define ptr @resolver() { + ret ptr inttoptr (i64 333 to ptr) +} + +; CHECK: llvm.mlir.ifunc private @ifunc {{.*}} {dso_local} +@ifunc = private dso_local ifunc float (i64), ptr @resolver + +; // ----- + +define ptr @resolver() { + ret ptr inttoptr (i64 333 to ptr) +} + +; CHECK: llvm.mlir.ifunc weak @ifunc +@ifunc = weak ifunc float (i64), ptr @resolver diff --git a/mlir/test/Target/LLVMIR/ifunc.mlir 
b/mlir/test/Target/LLVMIR/ifunc.mlir new file mode 100644 index 0000000000000..bba306c1e1ab3 --- /dev/null +++ b/mlir/test/Target/LLVMIR/ifunc.mlir @@ -0,0 +1,70 @@ +// RUN: mlir-translate -mlir-to-llvmir %s --split-input-file | FileCheck %s + +// CHECK: @foo = dso_local ifunc void (ptr, i32), ptr @resolve_foo +llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} +llvm.func @call_foo(%arg0: !llvm.ptr {llvm.noundef}, %arg1: i32 {llvm.noundef}) attributes {dso_local} { +// CHECK: call void @foo + llvm.call @foo(%arg0, %arg1) : (!llvm.ptr {llvm.noundef}, i32 {llvm.noundef}) -> () + llvm.return +} +llvm.func @foo_fptr() -> !llvm.ptr attributes {dso_local} { + %1 = llvm.mlir.addressof @foo : !llvm.ptr +// CHECK: ret ptr @foo + llvm.return %1 : !llvm.ptr +} +llvm.func internal @resolve_foo() -> !llvm.ptr attributes {dso_local} { + %0 = llvm.mlir.addressof @foo_1 : !llvm.ptr + llvm.return %0 : !llvm.ptr +} +llvm.func @foo_1(!llvm.ptr {llvm.noundef}, i32 {llvm.noundef}) + +// ----- + +llvm.mlir.alias external @resolver_alias : !llvm.func { + %0 = llvm.mlir.addressof @resolver : !llvm.ptr + llvm.return %0 : !llvm.ptr +} +llvm.mlir.alias external @resolver_alias_alias : !llvm.func { + %0 = llvm.mlir.addressof @resolver_alias : !llvm.ptr + llvm.return %0 : !llvm.ptr +} + +// CHECK-DAG: @ifunc = ifunc float (i64), ptr @resolver_alias +// CHECK-DAG: @ifunc2 = ifunc float (i64), ptr @resolver_alias_alias +llvm.mlir.ifunc external @ifunc2 : !llvm.func, !llvm.ptr @resolver_alias_alias +llvm.mlir.ifunc external @ifunc : !llvm.func, !llvm.ptr @resolver_alias +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: @ifunc = linkonce_odr hidden ifunc +llvm.mlir.ifunc linkonce_odr hidden @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: @ifunc = private ifunc +llvm.mlir.ifunc private @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: @ifunc = weak ifunc +llvm.mlir.ifunc weak @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} diff --git a/mlir/test/Target/LLVMIR/openmp-composite-simd-if.mlir b/mlir/test/Target/LLVMIR/openmp-composite-simd-if.mlir new file mode 100644 index 0000000000000..8f2e8ef06608d --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-composite-simd-if.mlir @@ -0,0 +1,97 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @_QPfoo(%arg0: !llvm.ptr {fir.bindc_name = "array", llvm.nocapture}, %arg1: !llvm.ptr {fir.bindc_name = "t", llvm.nocapture}) { + %0 = llvm.mlir.constant(0 : i64) : i32 + %1 = llvm.mlir.constant(1 : i32) : i32 + %2 = llvm.mlir.constant(10 : i64) : i64 + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %3 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr + %5 = llvm.load %arg1 : !llvm.ptr -> i32 + %6 = llvm.icmp "ne" %5, %0 : i32 + %7 = llvm.trunc %2 : i64 to i32 + omp.wsloop { + omp.simd if(%6) { + omp.loop_nest (%arg2) : i32 = (%1) to (%7) inclusive step (%1) { + llvm.store %arg2, %4 : i32, 
!llvm.ptr + %8 = llvm.load %4 : !llvm.ptr -> i32 + %9 = llvm.sext %8 : i32 to i64 + %10 = llvm.getelementptr %arg0[%9] : (!llvm.ptr, i64) -> !llvm.ptr, i32 + llvm.store %8, %10 : i32, !llvm.ptr + omp.yield + } + } {omp.composite} + } {omp.composite} + llvm.return +} + +// CHECK-LABEL: @_QPfoo +// ... +// CHECK: omp_loop.preheader: ; preds = +// CHECK: store i32 0, ptr %[[LB_ADDR:.*]], align 4 +// CHECK: store i32 9, ptr %[[UB_ADDR:.*]], align 4 +// CHECK: store i32 1, ptr %[[STEP_ADDR:.*]], align 4 +// CHECK: %[[VAL_15:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[VAL_15]], i32 34, ptr %{{.*}}, ptr %[[LB_ADDR]], ptr %[[UB_ADDR]], ptr %[[STEP_ADDR]], i32 1, i32 0) +// CHECK: %[[LB:.*]] = load i32, ptr %[[LB_ADDR]], align 4 +// CHECK: %[[UB:.*]] = load i32, ptr %[[UB_ADDR]], align 4 +// CHECK: %[[VAL_18:.*]] = sub i32 %[[UB]], %[[LB]] +// CHECK: %[[COUNT:.*]] = add i32 %[[VAL_18]], 1 +// CHECK: br label %[[OMP_LOOP_HEADER:.*]] +// CHECK: omp_loop.header: ; preds = %[[OMP_LOOP_INC:.*]], %[[OMP_LOOP_PREHEADER:.*]] +// CHECK: %[[IV:.*]] = phi i32 [ 0, %[[OMP_LOOP_PREHEADER]] ], [ %[[NEW_IV:.*]], %[[OMP_LOOP_INC]] ] +// CHECK: br label %[[OMP_LOOP_COND:.*]] +// CHECK: omp_loop.cond: ; preds = %[[OMP_LOOP_HEADER]] +// CHECK: %[[VAL_25:.*]] = icmp ult i32 %[[IV]], %[[COUNT]] +// CHECK: br i1 %[[VAL_25]], label %[[OMP_LOOP_BODY:.*]], label %[[OMP_LOOP_EXIT:.*]] +// CHECK: omp_loop.body: ; preds = %[[OMP_LOOP_COND]] +// CHECK: %[[VAL_28:.*]] = add i32 %[[IV]], %[[LB]] +// This is the IF clause: +// CHECK: br i1 %{{.*}}, label %[[SIMD_IF_THEN:.*]], label %[[SIMD_IF_ELSE:.*]] + +// CHECK: simd.if.then: ; preds = %[[OMP_LOOP_BODY]] +// CHECK: %[[VAL_29:.*]] = mul i32 %[[VAL_28]], 1 +// CHECK: %[[VAL_30:.*]] = add i32 %[[VAL_29]], 1 +// CHECK: br label %[[VAL_33:.*]] +// CHECK: omp.loop_nest.region: ; preds = %[[SIMD_IF_THEN]] +// This version contains !llvm.access.group metadata for SIMD +// CHECK: store i32 %[[VAL_30]], ptr %{{.*}}, align 4, !llvm.access.group !1 +// CHECK: %[[VAL_34:.*]] = load i32, ptr %{{.*}}, align 4, !llvm.access.group !1 +// CHECK: %[[VAL_35:.*]] = sext i32 %[[VAL_34]] to i64 +// CHECK: %[[VAL_36:.*]] = getelementptr i32, ptr %[[VAL_37:.*]], i64 %[[VAL_35]] +// CHECK: store i32 %[[VAL_34]], ptr %[[VAL_36]], align 4, !llvm.access.group !1 +// CHECK: br label %[[OMP_REGION_CONT3:.*]] +// CHECK: omp.region.cont3: ; preds = %[[VAL_33]] +// CHECK: br label %[[SIMD_PRE_LATCH:.*]] + +// CHECK: simd.pre_latch: ; preds = %[[OMP_REGION_CONT3]], %[[OMP_REGION_CONT35:.*]] +// CHECK: br label %[[OMP_LOOP_INC]] +// CHECK: omp_loop.inc: ; preds = %[[SIMD_PRE_LATCH]] +// CHECK: %[[NEW_IV]] = add nuw i32 %[[IV]], 1 +// CHECK: br label %[[OMP_LOOP_HEADER]], !llvm.loop !2 + +// CHECK: simd.if.else: ; preds = %[[OMP_LOOP_BODY]] +// CHECK: br label %[[SIMD_IF_ELSE2:.*]] +// CHECK: simd.if.else5: +// CHECK: %[[MUL:.*]] = mul i32 %[[VAL_28]], 1 +// CHECK: %[[ADD:.*]] = add i32 %[[MUL]], 1 +// CHECK: br label %[[LOOP_NEST_REGION:.*]] +// CHECK: omp.loop_nest.region6: ; preds = %[[SIMD_IF_ELSE2]] +// No llvm.access.group metadata for else clause +// CHECK: store i32 %[[ADD]], ptr %{{.*}}, align 4 +// CHECK: %[[VAL_42:.*]] = load i32, ptr %{{.*}}, align 4 +// CHECK: %[[VAL_43:.*]] = sext i32 %[[VAL_42]] to i64 +// CHECK: %[[VAL_44:.*]] = getelementptr i32, ptr %[[VAL_37]], i64 %[[VAL_43]] +// CHECK: store i32 %[[VAL_42]], ptr %[[VAL_44]], align 4 +// CHECK: br label %[[OMP_REGION_CONT35]] +// CHECK: omp.region.cont37: ; preds = 
%[[LOOP_NEST_REGION]] +// CHECK: br label %[[SIMD_PRE_LATCH]] + +// CHECK: omp_loop.exit: ; preds = %[[OMP_LOOP_COND]] +// CHECK: call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_15]]) +// CHECK: %[[VAL_45:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_barrier(ptr @2, i32 %[[VAL_45]]) + +// CHECK: !1 = distinct !{} +// CHECK: !2 = distinct !{!2, !3} +// CHECK: !3 = !{!"llvm.loop.parallel_accesses", !1} +// CHECK-NOT: llvm.loop.vectorize diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 32f0ba5b105ff..3f4dcd5e24c56 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -820,8 +820,6 @@ llvm.func @simd_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fi } // Be sure that llvm.loop.vectorize.enable metadata appears twice // CHECK: llvm.loop.parallel_accesses -// CHECK-NEXT: llvm.loop.vectorize.enable -// CHECK: llvm.loop.vectorize.enable // ----- diff --git a/mlir/test/Target/LLVMIR/openmp-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-reduction.mlir index 11c8559044be0..08473f243bcf1 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction.mlir @@ -637,9 +637,12 @@ llvm.func @wsloop_simd_reduction(%lb : i64, %ub : i64, %step : i64) { // Outlined function. // CHECK: define internal void @[[OUTLINED]] -// Private reduction variable and its initialization. +// reduction variable in wsloop // CHECK: %[[PRIVATE:.+]] = alloca float +// reduction variable in simd +// CHECK: %[[PRIVATE2:.+]] = alloca float // CHECK: store float 0.000000e+00, ptr %[[PRIVATE]] +// CHECK: store float 0.000000e+00, ptr %[[PRIVATE2]] // Call to the reduction function. // CHECK: call i32 @__kmpc_reduce @@ -659,9 +662,9 @@ llvm.func @wsloop_simd_reduction(%lb : i64, %ub : i64, %step : i64) { // Update of the private variable using the reduction region // (the body block currently comes after all the other blocks). -// CHECK: %[[PARTIAL:.+]] = load float, ptr %[[PRIVATE]] +// CHECK: %[[PARTIAL:.+]] = load float, ptr %[[PRIVATE2]] // CHECK: %[[UPDATED:.+]] = fadd float 2.000000e+00, %[[PARTIAL]] -// CHECK: store float %[[UPDATED]], ptr %[[PRIVATE]] +// CHECK: store float %[[UPDATED]], ptr %[[PRIVATE2]] // Reduction function. 
// CHECK: define internal void @[[REDFUNC]] diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index e2c32b254c200..2fa4470bb8300 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -489,43 +489,3 @@ llvm.func @wsloop_order(%lb : i32, %ub : i32, %step : i32) { } llvm.return } - -// ----- - -llvm.func @do_simd_if(%1 : !llvm.ptr, %5 : i32, %4 : i32, %6 : i1) { - omp.wsloop { - // expected-warning@below {{simd information on composite construct discarded}} - omp.simd if(%6) { - omp.loop_nest (%arg0) : i32 = (%5) to (%4) inclusive step (%5) { - llvm.store %arg0, %1 : i32, !llvm.ptr - omp.yield - } - } {omp.composite} - } {omp.composite} - llvm.return -} - -// ----- - -omp.declare_reduction @add_reduction_i32 : i32 init { -^bb0(%arg0: i32): - %0 = llvm.mlir.constant(0 : i32) : i32 - omp.yield(%0 : i32) -} combiner { -^bb0(%arg0: i32, %arg1: i32): - %0 = llvm.add %arg0, %arg1 : i32 - omp.yield(%0 : i32) -} -llvm.func @do_simd_reduction(%1 : !llvm.ptr, %3 : !llvm.ptr, %6 : i32, %7 : i32) { - omp.wsloop reduction(@add_reduction_i32 %3 -> %arg0 : !llvm.ptr) { - // expected-warning@below {{simd information on composite construct discarded}} - omp.simd reduction(@add_reduction_i32 %arg0 -> %arg1 : !llvm.ptr) { - omp.loop_nest (%arg2) : i32 = (%7) to (%6) inclusive step (%7) { - llvm.store %arg2, %1 : i32, !llvm.ptr - %12 = llvm.load %arg1 : !llvm.ptr -> i32 - omp.yield - } - } {omp.composite} - } {omp.composite} - llvm.return -} diff --git a/mlir/test/Target/SPIRV/constant.mlir b/mlir/test/Target/SPIRV/constant.mlir index 8d4e53418b70f..76d34c2a96e67 100644 --- a/mlir/test/Target/SPIRV/constant.mlir +++ b/mlir/test/Target/SPIRV/constant.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s +// RUN: mlir-translate --no-implicit-module --split-input-file --test-spirv-roundtrip %s | FileCheck %s spirv.module Logical GLSL450 requires #spirv.vce { // CHECK-LABEL: @bool_const @@ -306,3 +306,106 @@ spirv.module Logical GLSL450 requires #spirv.vce { spirv.ReturnValue %coop : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc> } } + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce { + + // CHECK-LABEL: @splat_vector_i32 + spirv.func @splat_vector_i32() -> (vector<3xi32>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [1 : i32] : vector<3xi32> + %1 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : vector<3xi32> + spirv.ReturnValue %1 : vector<3xi32> + } + + // CHECK-LABEL: @splat_array_of_i32 + spirv.func @splat_array_of_i32() -> (!spirv.array<3 x i32>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.array<3 x i32> + %1 = spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.array<3 x i32> + spirv.ReturnValue %1 : !spirv.array<3 x i32> + } + + // CHECK-LABEL: @splat_array_of_vectors_of_i32 + spirv.func @splat_array_of_vectors_of_i32() -> (!spirv.array<3 x vector<2xi32>>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<3 x vector<2xi32>> + %0 = spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<3 x vector<2xi32>> + spirv.ReturnValue %0 : !spirv.array<3 x vector<2xi32>> + } + + // CHECK-LABEL: @splat_array_of_splat_array_of_i32 + spirv.func @splat_array_of_splat_array_of_i32() -> (!spirv.array<2 x !spirv.array<3 x i32>>) "None" { + // CHECK: %0 = spirv.EXT.ConstantCompositeReplicate [3 : i32] : !spirv.array<2 x 
!spirv.array<3 x i32>> + %0 = spirv.EXT.ConstantCompositeReplicate [3 : i32] : !spirv.array<2 x !spirv.array<3 x i32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x i32>> + } + + // CHECK-LABEL: @splat_array_of_non_splat_array_of_i32 + spirv.func @splat_array_of_non_splat_array_of_i32() -> (!spirv.array<2 x !spirv.array<3 x i32>>) "None" { + // CHECK: %0 = spirv.EXT.ConstantCompositeReplicate {{\[}}[1 : i32, 2 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<3 x i32>> + %0 = spirv.EXT.ConstantCompositeReplicate [[1 : i32, 2 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<3 x i32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x i32>> + } + + // CHECK-LABEL: @splat_array_of_splat_vectors_of_i32 + spirv.func @splat_array_of_splat_vectors_of_i32() -> (!spirv.array<2 x vector<2xi32>>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [2 : i32] : !spirv.array<2 x vector<2xi32>> + %0 = spirv.EXT.ConstantCompositeReplicate [2 : i32] : !spirv.array<2 x vector<2xi32>> + spirv.ReturnValue %0 : !spirv.array<2 x vector<2xi32>> + } + + // CHECK-LABEL: @splat_arm_tensor_of_i32 + spirv.func @splat_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [2 : i32] : !spirv.arm.tensor<2x3xi32> + %0 = spirv.EXT.ConstantCompositeReplicate [2 : i32] : !spirv.arm.tensor<2x3xi32> + spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xi32> + } + + // CHECK-LABEL: @splat_vector_f32 + spirv.func @splat_vector_f32() -> (vector<3xf32>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [1.000000e+00 : f32] : vector<3xf32> + %1 = spirv.EXT.ConstantCompositeReplicate [1.0 : f32] : vector<3xf32> + spirv.ReturnValue %1 : vector<3xf32> + } + + // CHECK-LABEL: @splat_array_of_f32 + spirv.func @splat_array_of_f32() -> (!spirv.array<3 x f32>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [1.000000e+00 : f32] : !spirv.array<3 x f32> + %1 = spirv.EXT.ConstantCompositeReplicate [1.0 : f32] : !spirv.array<3 x f32> + spirv.ReturnValue %1 : !spirv.array<3 x f32> + } + + // CHECK-LABEL: @splat_array_of_splat_array_of_f32 + spirv.func @splat_array_of_splat_array_of_f32() -> (!spirv.array<2 x !spirv.array<3 x f32>>) "None" { + // CHECK: %0 = spirv.EXT.ConstantCompositeReplicate [3.000000e+00 : f32] : !spirv.array<2 x !spirv.array<3 x f32>> + %0 = spirv.EXT.ConstantCompositeReplicate [3.0 : f32] : !spirv.array<2 x !spirv.array<3 x f32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x f32>> + } + + // CHECK-LABEL: @splat_array_of_non_splat_array_of_f32 + spirv.func @splat_array_of_non_splat_array_of_f32() -> (!spirv.array<2 x !spirv.array<3 x f32>>) "None" { + // CHECK: %0 = spirv.EXT.ConstantCompositeReplicate {{\[}}[1.000000e+00 : f32, 2.000000e+00 : f32, 3.000000e+00 : f32]] : !spirv.array<2 x !spirv.array<3 x f32>> + %0 = spirv.EXT.ConstantCompositeReplicate [[1.0 : f32, 2.0 : f32, 3.0 : f32]] : !spirv.array<2 x !spirv.array<3 x f32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x f32>> + } + + // CHECK-LABEL: @splat_array_of_vectors_of_f32 + spirv.func @splat_array_of_vectors_of_f32() -> (!spirv.array<3 x vector<2xf32>>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [dense<[1.000000e+00, 2.000000e+00]> : vector<2xf32>] : !spirv.array<3 x vector<2xf32>> + %0 = spirv.EXT.ConstantCompositeReplicate [dense<[1.0, 2.0]> : vector<2xf32>] : !spirv.array<3 x vector<2xf32>> + spirv.ReturnValue %0 : !spirv.array<3 x vector<2xf32>> + } + + // CHECK-LABEL: @splat_array_of_splat_vectors_of_f32 + spirv.func 
@splat_array_of_splat_vectors_of_f32() -> (!spirv.array<2 x vector<2xf32>>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [2.000000e+00 : f32] : !spirv.array<2 x vector<2xf32>> + %0 = spirv.EXT.ConstantCompositeReplicate [2.0 : f32] : !spirv.array<2 x vector<2xf32>> + spirv.ReturnValue %0 : !spirv.array<2 x vector<2xf32>> + } + + // CHECK-LABEL: @splat_arm_tensor_of_f32 + spirv.func @splat_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" { + // CHECK: spirv.EXT.ConstantCompositeReplicate [2.000000e+00 : f32] : !spirv.arm.tensor<2x3xf32> + %0 = spirv.EXT.ConstantCompositeReplicate [2.0 : f32] : !spirv.arm.tensor<2x3xf32> + spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xf32> + } +} diff --git a/mlir/test/Target/SPIRV/spec-constant.mlir b/mlir/test/Target/SPIRV/spec-constant.mlir index 078d77125b3fd..f434956ab34a3 100644 --- a/mlir/test/Target/SPIRV/spec-constant.mlir +++ b/mlir/test/Target/SPIRV/spec-constant.mlir @@ -88,6 +88,33 @@ spirv.module Logical GLSL450 requires #spirv.vce { // ----- +spirv.module Logical GLSL450 requires #spirv.vce { + + spirv.SpecConstant @sc_i32_1 = 1 : i32 + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_i32 (@sc_i32_1) : !spirv.array<3 x i32> + spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_i32 (@sc_i32_1) : !spirv.array<3 x i32> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_i32 (@sc_i32_1) : !spirv.struct<(i32, i32, i32)> + spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_i32 (@sc_i32_1) : !spirv.struct<(i32, i32, i32)> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_vector_of_i32 (@sc_i32_1) : vector<3xi32> + spirv.EXT.SpecConstantCompositeReplicate @scc_splat_vector_of_i32 (@sc_i32_1) : vector<3 x i32> + + spirv.SpecConstant @sc_f32_1 = 1.0 : f32 + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_f32 (@sc_f32_1) : !spirv.array<3 x f32> + spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_f32 (@sc_f32_1) : !spirv.array<3 x f32> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_f32 (@sc_f32_1) : !spirv.struct<(f32, f32, f32)> + spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_f32 (@sc_f32_1) : !spirv.struct<(f32, f32, f32)> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_vector_of_f32 (@sc_f32_1) : vector<3xf32> + spirv.EXT.SpecConstantCompositeReplicate @scc_splat_vector_of_f32 (@sc_f32_1) : vector<3 x f32> +} + +// ----- + spirv.module Logical GLSL450 requires #spirv.vce { spirv.SpecConstant @sc_i32_1 = 1 : i32 diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 9b5cadd62befc..233fef8ec4296 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -301,6 +301,17 @@ def find_real_python_interpreter(): ToolSubst("mlir-opt", "mlir-opt --verify-roundtrip", unresolved="fatal"), ] ) +elif "MLIR_GENERATE_PATTERN_CATALOG" in os.environ: + tools.extend( + [ + ToolSubst( + "mlir-opt", + "mlir-opt --debug-only=pattern-logging-listener --mlir-disable-threading", + unresolved="fatal", + ), + ToolSubst("FileCheck", "FileCheck --dump-input=always", unresolved="fatal"), + ] + ) else: tools.extend(["mlir-opt"]) diff --git a/mlir/test/python/dialects/python_test.py b/mlir/test/python/dialects/python_test.py index fd678f8321fd9..694616696a9e2 100644 --- a/mlir/test/python/dialects/python_test.py +++ b/mlir/test/python/dialects/python_test.py @@ -361,7 +361,9 @@ def testCustomAttribute(): try: TestAttr(42) except TypeError as e: - assert "Expected an 
MLIR object" in str(e) + assert "Expected an MLIR object (got 42)" in str(e) + except ValueError as e: + assert "Cannot cast attribute to TestAttr (from 42)" in str(e) else: raise @@ -406,7 +408,9 @@ def testCustomType(): try: TestType(42) except TypeError as e: - assert "Expected an MLIR object" in str(e) + assert "Expected an MLIR object (got 42)" in str(e) + except ValueError as e: + assert "Cannot cast type to TestType (from 42)" in str(e) else: raise diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index b08fe98397fbc..ede1571f940f6 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -686,6 +686,15 @@ def testOperationPrint(): skip_regions=True, ) + # Test print with large_resource_limit. + # CHECK: func.func @f1(%arg0: i32) -> i32 + # CHECK-NOT: resource1: "0x08 + module.operation.print(large_resource_limit=2) + + # Test large_elements_limit has no effect on resource string + # CHECK: func.func @f1(%arg0: i32) -> i32 + # CHECK: resource1: "0x08 + module.operation.print(large_elements_limit=2) # CHECK-LABEL: TEST: testKnownOpView @run diff --git a/mlir/test/python/lib/PythonTestModuleNanobind.cpp b/mlir/test/python/lib/PythonTestModuleNanobind.cpp index dd0a9f2b66ea6..108df8da86a7e 100644 --- a/mlir/test/python/lib/PythonTestModuleNanobind.cpp +++ b/mlir/test/python/lib/PythonTestModuleNanobind.cpp @@ -113,8 +113,10 @@ NB_MODULE(_mlirPythonTestNanobind, m) { .attr(MLIR_PYTHON_CAPI_VALUE_CASTER_REGISTER_ATTR)( mlirRankedTensorTypeID)( nanobind::cpp_function([valueCls](const nb::object &valueObj) { - nb::object capsule = mlirApiObjectToCapsule(valueObj); - MlirValue v = mlirPythonCapsuleToValue(capsule.ptr()); + std::optional capsule = + mlirApiObjectToCapsule(valueObj); + assert(capsule.has_value() && "capsule is not null"); + MlirValue v = mlirPythonCapsuleToValue(capsule.value().ptr()); MlirType t = mlirValueGetType(v); // This is hyper-specific in order to exercise/test registering a // value caster from cpp (but only for a single test case; see diff --git a/mlir/test/python/pass_manager.py b/mlir/test/python/pass_manager.py index 85d2eb304882e..e26d42bb32913 100644 --- a/mlir/test/python/pass_manager.py +++ b/mlir/test/python/pass_manager.py @@ -363,6 +363,63 @@ def testPrintIrLargeLimitElements(): pm.run(module) +# CHECK-LABEL: TEST: testPrintIrLargeResourceLimit +@run +def testPrintIrLargeResourceLimit(): + with Context() as ctx: + module = ModuleOp.parse( + """ + module { + func.func @main() -> tensor<3xi64> { + %0 = arith.constant dense_resource : tensor<3xi64> + return %0 : tensor<3xi64> + } + } + {-# + dialect_resources: { + builtin: { + blob1: "0x010000000000000002000000000000000300000000000000" + } + } + #-} + """ + ) + pm = PassManager.parse("builtin.module(canonicalize)") + ctx.enable_multithreading(False) + pm.enable_ir_printing(large_resource_limit=4) + # CHECK-NOT: blob1: "0x01 + pm.run(module) + + +# CHECK-LABEL: TEST: testPrintIrLargeResourceLimitVsElementsLimit +@run +def testPrintIrLargeResourceLimitVsElementsLimit(): + """Test that large_elements_limit does not affect the printing of resources.""" + with Context() as ctx: + module = ModuleOp.parse( + """ + module { + func.func @main() -> tensor<3xi64> { + %0 = arith.constant dense_resource : tensor<3xi64> + return %0 : tensor<3xi64> + } + } + {-# + dialect_resources: { + builtin: { + blob1: "0x010000000000000002000000000000000300000000000000" + } + } + #-} + """ + ) + pm = PassManager.parse("builtin.module(canonicalize)") + 
ctx.enable_multithreading(False) + pm.enable_ir_printing(large_elements_limit=1) + # CHECK-NOT: blob1: "0x01 + pm.run(module) + + # CHECK-LABEL: TEST: testPrintIrTree @run def testPrintIrTree(): diff --git a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp index 7df500bc9568a..dd0b09f7f05d2 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp @@ -608,3 +608,97 @@ TEST(IntegerRelationTest, convertVarKindToLocal) { EXPECT_EQ(space.getId(VarKind::Symbol, 0), Identifier(&identifiers[3])); EXPECT_EQ(space.getId(VarKind::Symbol, 1), Identifier(&identifiers[4])); } + +TEST(IntegerRelationTest, rangeProduct) { + IntegerRelation r1 = parseRelationFromSet( + "(i, j, k) : (2*i + 3*k == 0, i >= 0, j >= 0, k >= 0)", 2); + IntegerRelation r2 = parseRelationFromSet( + "(i, j, l) : (4*i + 6*j + 9*l == 0, i >= 0, j >= 0, l >= 0)", 2); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j, k, l) : (2*i + 3*k == 0, 4*i + 6*j + 9*l == " + "0, i >= 0, j >= 0, k >= 0, l >= 0)", + 2); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductMultdimRange) { + IntegerRelation r1 = + parseRelationFromSet("(i, k) : (2*i + 3*k == 0, i >= 0, k >= 0)", 1); + IntegerRelation r2 = parseRelationFromSet( + "(i, l, m) : (4*i + 6*m + 9*l == 0, i >= 0, l >= 0, m >= 0)", 1); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, k, l, m) : (2*i + 3*k == 0, 4*i + 6*m + 9*l == " + "0, i >= 0, k >= 0, l >= 0, m >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductMultdimRangeSwapped) { + IntegerRelation r1 = parseRelationFromSet( + "(i, l, m) : (4*i + 6*m + 9*l == 0, i >= 0, l >= 0, m >= 0)", 1); + IntegerRelation r2 = + parseRelationFromSet("(i, k) : (2*i + 3*k == 0, i >= 0, k >= 0)", 1); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, l, m, k) : (2*i + 3*k == 0, 4*i + 6*m + 9*l == " + "0, i >= 0, k >= 0, l >= 0, m >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyDomain) { + IntegerRelation r1 = + parseRelationFromSet("(i, j) : (4*i + 9*j == 0, i >= 0, j >= 0)", 0); + IntegerRelation r2 = + parseRelationFromSet("(k, l) : (2*k + 3*l == 0, k >= 0, l >= 0)", 0); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j, k, l) : (2*k + 3*l == 0, 4*i + 9*j == " + "0, i >= 0, j >= 0, k >= 0, l >= 0)", + 0); + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyRange) { + IntegerRelation r1 = + parseRelationFromSet("(i, j) : (4*i + 9*j == 0, i >= 0, j >= 0)", 2); + IntegerRelation r2 = + parseRelationFromSet("(i, j) : (2*i + 3*j == 0, i >= 0, j >= 0)", 2); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j) : (2*i + 3*j == 0, 4*i + 9*j == " + "0, i >= 0, j >= 0)", + 2); + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyDomainAndRange) { + IntegerRelation r1 = parseRelationFromSet("() : ()", 0); + IntegerRelation r2 = parseRelationFromSet("() : ()", 0); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = parseRelationFromSet("() : ()", 0); + 
EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductSymbols) { + IntegerRelation r1 = parseRelationFromSet( + "(i, j)[s] : (2*i + 3*j + s == 0, i >= 0, j >= 0)", 1); + IntegerRelation r2 = parseRelationFromSet( + "(i, l)[s] : (3*i + 4*l + s == 0, i >= 0, l >= 0)", 1); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = parseRelationFromSet( + "(i, j, l)[s] : (2*i + 3*j + s == 0, 3*i + 4*l + s == " + "0, i >= 0, j >= 0, l >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp index aa16421cbec51..836efdb307f97 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp @@ -519,14 +519,44 @@ TEST_F(OpenACCOpsTest, routineOpTest) { op->removeGangDimDeviceTypeAttr(); op->removeGangDimAttr(); - op->setBindNameDeviceTypeAttr(b.getArrayAttr({dtypeNone})); - op->setBindNameAttr(b.getArrayAttr({b.getStringAttr("fname")})); + op->setBindIdNameDeviceTypeAttr( + b.getArrayAttr({DeviceTypeAttr::get(&context, DeviceType::Host)})); + op->setBindStrNameDeviceTypeAttr(b.getArrayAttr({dtypeNone})); + op->setBindIdNameAttr( + b.getArrayAttr({SymbolRefAttr::get(&context, "test_symbol")})); + op->setBindStrNameAttr(b.getArrayAttr({b.getStringAttr("fname")})); EXPECT_TRUE(op->getBindNameValue().has_value()); - EXPECT_EQ(op->getBindNameValue().value(), "fname"); - for (auto d : dtypesWithoutNone) - EXPECT_FALSE(op->getBindNameValue(d).has_value()); - op->removeBindNameDeviceTypeAttr(); - op->removeBindNameAttr(); + EXPECT_TRUE(op->getBindNameValue(DeviceType::Host).has_value()); + EXPECT_EQ(std::visit( + [](const auto &attr) -> std::string { + if constexpr (std::is_same_v, + mlir::StringAttr>) { + return attr.str(); + } else { + return attr.getLeafReference().str(); + } + }, + op->getBindNameValue().value()), + "fname"); + EXPECT_EQ(std::visit( + [](const auto &attr) -> std::string { + if constexpr (std::is_same_v, + mlir::StringAttr>) { + return attr.str(); + } else { + return attr.getLeafReference().str(); + } + }, + op->getBindNameValue(DeviceType::Host).value()), + "test_symbol"); + for (auto d : dtypesWithoutNone) { + if (d != DeviceType::Host) + EXPECT_FALSE(op->getBindNameValue(d).has_value()); + } + op->removeBindIdNameDeviceTypeAttr(); + op->removeBindStrNameDeviceTypeAttr(); + op->removeBindIdNameAttr(); + op->removeBindStrNameAttr(); } template diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt index d22afb3003e76..a46e64718dab9 100644 --- a/mlir/unittests/IR/CMakeLists.txt +++ b/mlir/unittests/IR/CMakeLists.txt @@ -6,6 +6,7 @@ add_mlir_unittest(MLIRIRTests AttrTypeReplacerTest.cpp Diagnostic.cpp DialectTest.cpp + DistinctAttributeAllocatorTest.cpp InterfaceTest.cpp IRMapping.cpp InterfaceAttachmentTest.cpp diff --git a/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp b/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp new file mode 100644 index 0000000000000..99067d09f7bed --- /dev/null +++ b/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp @@ -0,0 +1,45 @@ +//=== DistinctAttributeAllocatorTest.cpp - DistinctAttr storage alloc test ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "llvm/Support/CrashRecoveryContext.h"
+#include <thread>
+
+using namespace mlir;
+
+//
+// Test that a DistinctAttr that is created on a separate thread does
+// not have its storage deleted when the thread joins.
+//
+TEST(DistinctAttributeAllocatorTest, TestAttributeWellFormedAfterThreadJoin) {
+  MLIRContext ctx;
+  OpBuilder builder(&ctx);
+  DistinctAttr attr;
+
+  std::thread t([&ctx, &attr]() {
+    attr = DistinctAttr::create(UnitAttr::get(&ctx));
+    ASSERT_TRUE(attr);
+  });
+  t.join();
+
+  // If the attribute storage got deleted after the thread joins (which we don't
+  // want) then trying to access it triggers an assert in Debug mode, and a
+  // crash otherwise. Run this in a CrashRecoveryContext to avoid bringing down
+  // the whole test suite if this test fails. Additionally, MSAN and/or TSAN
+  // should raise failures here if the attribute storage was deleted.
+  llvm::CrashRecoveryContext crc;
+  EXPECT_TRUE(crc.RunSafely([attr]() { (void)attr.getAbstractAttribute(); }));
+  EXPECT_TRUE(
+      crc.RunSafely([attr]() { (void)*cast<DistinctAttr>(attr).getImpl(); }));
+
+  ASSERT_TRUE(attr);
+}
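// The OffloadImpl.cpp change below memoizes symbol lookups. Distilled to a
// minimal sketch (simplified names, not the literal liboffload API), the
// pattern is a mutex-guarded llvm::StringMap of owning pointers, so every
// lookup of the same name returns the same stable handle:
//
//   llvm::StringMap<std::unique_ptr<Symbol>> Cache;
//   std::mutex CacheMutex;
//
//   Symbol *getSymbol(llvm::StringRef Name) {
//     std::lock_guard<std::mutex> Lock(CacheMutex);
//     auto &Slot = Cache[Name];          // default-constructs an empty slot
//     if (!Slot)                         // first lookup: build and cache
//       Slot = std::make_unique<Symbol>(Name);
//     return Slot.get();                 // later lookups: same pointer
//   }
//
// Because the map owns unique_ptr elements, the handles stay stable even when
// the StringMap rehashes, which is what the SuccessSamePtr tests rely on.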
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index c4e7f9689a900..ffc9016bca0a3 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -84,17 +84,20 @@ struct ol_program_impl_t {
         DeviceImage(DeviceImage) {}
   plugin::DeviceImageTy *Image;
   std::unique_ptr<llvm::MemoryBuffer> ImageData;
-  std::vector<std::unique_ptr<ol_symbol_impl_t>> Symbols;
+  std::mutex SymbolListMutex;
   __tgt_device_image DeviceImage;
+  llvm::StringMap<std::unique_ptr<ol_symbol_impl_t>> KernelSymbols;
+  llvm::StringMap<std::unique_ptr<ol_symbol_impl_t>> GlobalSymbols;
 };
 
 struct ol_symbol_impl_t {
-  ol_symbol_impl_t(GenericKernelTy *Kernel)
-      : PluginImpl(Kernel), Kind(OL_SYMBOL_KIND_KERNEL) {}
-  ol_symbol_impl_t(GlobalTy &&Global)
-      : PluginImpl(Global), Kind(OL_SYMBOL_KIND_GLOBAL_VARIABLE) {}
+  ol_symbol_impl_t(const char *Name, GenericKernelTy *Kernel)
+      : PluginImpl(Kernel), Kind(OL_SYMBOL_KIND_KERNEL), Name(Name) {}
+  ol_symbol_impl_t(const char *Name, GlobalTy &&Global)
+      : PluginImpl(Global), Kind(OL_SYMBOL_KIND_GLOBAL_VARIABLE), Name(Name) {}
   std::variant<GenericKernelTy *, GlobalTy> PluginImpl;
   ol_symbol_kind_t Kind;
+  llvm::StringRef Name;
 };
 
 namespace llvm {
@@ -714,32 +717,40 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name,
                        ol_symbol_kind_t Kind, ol_symbol_handle_t *Symbol) {
   auto &Device = Program->Image->getDevice();
 
+  std::lock_guard<std::mutex> Lock{Program->SymbolListMutex};
+
   switch (Kind) {
   case OL_SYMBOL_KIND_KERNEL: {
-    auto KernelImpl = Device.constructKernel(Name);
-    if (!KernelImpl)
-      return KernelImpl.takeError();
+    auto &Kernel = Program->KernelSymbols[Name];
+    if (!Kernel) {
+      auto KernelImpl = Device.constructKernel(Name);
+      if (!KernelImpl)
+        return KernelImpl.takeError();
 
-    if (auto Err = KernelImpl->init(Device, *Program->Image))
-      return Err;
+      if (auto Err = KernelImpl->init(Device, *Program->Image))
+        return Err;
+
+      Kernel = std::make_unique<ol_symbol_impl_t>(KernelImpl->getName(),
+                                                  &*KernelImpl);
+    }
 
-    *Symbol =
-        Program->Symbols
-            .emplace_back(std::make_unique<ol_symbol_impl_t>(&*KernelImpl))
-            .get();
+    *Symbol = Kernel.get();
     return Error::success();
   }
   case OL_SYMBOL_KIND_GLOBAL_VARIABLE: {
-    GlobalTy GlobalObj{Name};
-    if (auto Res = Device.Plugin.getGlobalHandler().getGlobalMetadataFromDevice(
-            Device, *Program->Image, GlobalObj))
-      return Res;
-
-    *Symbol = Program->Symbols
-                  .emplace_back(
-                      std::make_unique<ol_symbol_impl_t>(std::move(GlobalObj)))
-                  .get();
+    auto &Global = Program->GlobalSymbols[Name];
+    if (!Global) {
+      GlobalTy GlobalObj{Name};
+      if (auto Res =
+              Device.Plugin.getGlobalHandler().getGlobalMetadataFromDevice(
+                  Device, *Program->Image, GlobalObj))
+        return Res;
+
+      Global = std::make_unique<ol_symbol_impl_t>(GlobalObj.getName().c_str(),
+                                                  std::move(GlobalObj));
+    }
+    *Symbol = Global.get();
     return Error::success();
   }
   default:
diff --git a/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp b/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp
index 5e87ab5b29621..1f496b9c6e1ae 100644
--- a/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp
+++ b/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp
@@ -41,6 +41,14 @@ TEST_P(olGetSymbolKernelTest, Success) {
   ASSERT_NE(Kernel, nullptr);
 }
 
+TEST_P(olGetSymbolKernelTest, SuccessSamePtr) {
+  ol_symbol_handle_t KernelA = nullptr;
+  ol_symbol_handle_t KernelB = nullptr;
+  ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &KernelA));
+  ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &KernelB));
+  ASSERT_EQ(KernelA, KernelB);
+}
+
 TEST_P(olGetSymbolKernelTest, InvalidNullProgram) {
   ol_symbol_handle_t Kernel = nullptr;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
@@ -72,6 +80,16 @@ TEST_P(olGetSymbolGlobalTest, Success) {
   ASSERT_NE(Global, nullptr);
 }
 
+TEST_P(olGetSymbolGlobalTest, SuccessSamePtr) {
+  ol_symbol_handle_t GlobalA = nullptr;
+  ol_symbol_handle_t GlobalB = nullptr;
+  ASSERT_SUCCESS(
+      olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &GlobalA));
+  ASSERT_SUCCESS(
+      olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &GlobalB));
+  ASSERT_EQ(GlobalA, GlobalB);
+}
+
 TEST_P(olGetSymbolGlobalTest, InvalidNullProgram) {
   ol_symbol_handle_t Global = nullptr;
   ASSERT_ERROR(
diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
index 3993ae17f7fc4..d1ae6cc82b943 100644
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -358,7 +358,6 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
     C.num_sharing = static_cast<int>(b.count());
    C.level = cache.Level;
    C.size = cache.Size;
-    C.type = "Unknown";
    switch (cache.Type) {
      case CacheUnified:
        C.type = "Unified";
        break;
@@ -372,6 +371,9 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
      case CacheTrace:
        C.type = "Trace";
        break;
+      default:
+        C.type = "Unknown";
+        break;
    }
    res.push_back(C);
  }
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index ac4e3813fbba8..3598944381900 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -270,6 +270,12 @@ td_library(
     includes = ["include"],
 )
 
+td_library(
+    name = "BuiltinsRISCVXAndesTdFiles",
+    srcs = ["include/clang/Basic/BuiltinsRISCVXAndes.td"],
+    includes = ["include"],
+)
+
 td_library(
     name = "BuiltinsX86BaseTdFiles",
     srcs = ["include/clang/Basic/BuiltinsX86Base.td"],
@@ -348,6 +354,7 @@ gentbl_cc_library(
     td_file = "include/clang/Basic/BuiltinsRISCV.td",
     deps = [
         ":BuiltinsBaseTdFiles",
+        ":BuiltinsRISCVXAndesTdFiles",
         ":BuiltinsRISCVXCVTdFiles",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index cd608a1352a7a..e3d807a46fe6a 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1919,7 +1919,7 @@
libc_support_library( srcs = ["src/math/generic/common_constants.cpp"], hdrs = ["src/math/generic/common_constants.h"], deps = [ - ":__support_fputil_triple_double", + ":__support_math_exp_constants", ":__support_number_pair", ], ) @@ -1996,16 +1996,15 @@ libc_support_library( libc_support_library( name = "explogxf", - srcs = ["src/math/generic/explogxf.cpp"], hdrs = ["src/math/generic/explogxf.h"], deps = [ - ":__support_common", ":__support_fputil_fenv_impl", ":__support_fputil_fma", - ":__support_fputil_fp_bits", ":__support_fputil_multiply_add", ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", + ":__support_math_exp_utils", + ":__support_math_exp10f_utils", + ":__support_macros_properties_cpu_features", ":common_constants", ], ) @@ -2050,19 +2049,6 @@ libc_support_library( ], ) -libc_support_library( - name = "exp10f_impl", - hdrs = ["src/math/generic/exp10f_impl.h"], - deps = [ - ":__support_fputil_fma", - ":__support_fputil_multiply_add", - ":__support_fputil_rounding_mode", - ":__support_macros_optimization", - ":common_constants", - ":explogxf", - ], -) - libc_support_library( name = "exp2f_impl", hdrs = ["src/math/generic/exp2f_impl.h"], @@ -2205,6 +2191,91 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_exp_constants", + hdrs = ["src/__support/math/exp_constants.h"], + deps = [ + ":__support_fputil_triple_double", + ], +) + +libc_support_library( + name = "__support_math_exp_utils", + hdrs = ["src/__support/math/exp_utils.h"], + deps = [ + ":__support_cpp_optional", + ":__support_cpp_bit", + ":__support_fputil_fp_bits", + ], +) + +libc_support_library( + name = "__support_math_exp", + hdrs = ["src/__support/math/exp.h"], + deps = [ + ":__support_math_exp_constants", + ":__support_math_exp_utils", + ":__support_cpp_bit", + ":__support_cpp_optional", + ":__support_fputil_dyadic_float", + ":__support_fputil_fenv_impl", + ":__support_fputil_fp_bits", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_fputil_triple_double", + ":__support_fputil_double_double", + ":__support_integer_literals", + ":__support_macros_optimization", + ], +) + +libc_support_library( + name = "__support_math_exp10", + hdrs = ["src/__support/math/exp10.h"], + deps = [ + ":__support_math_exp_constants", + ":__support_math_exp_utils", + ":__support_fputil_double_double", + ":__support_fputil_dyadic_float", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_fputil_triple_double", + ":__support_integer_literals", + ":__support_macros_optimization", + ], +) + +libc_support_library( + name = "__support_math_exp10f_utils", + hdrs = ["src/__support/math/exp10f_utils.h"], + deps = [ + ":__support_fputil_basic_operations", + ":__support_fputil_fenv_impl", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_common", + ":__support_math_exp_utils", + ], +) + +libc_support_library( + name = "__support_math_exp10f", + hdrs = ["src/__support/math/exp10f.h"], + deps = [ + ":__support_math_exp10f_utils", + ":__support_fputil_fenv_impl", + ":__support_fputil_fp_bits", + ":__support_fputil_multiply_add", + ":__support_fputil_rounding_mode", + ":__support_macros_optimization", + ], +) + ############################### complex targets ################################ libc_function( @@ -2668,10 +2739,10 
@@ libc_math_function( name = "cosf", additional_deps = [ ":__support_fputil_fma", - ":__support_fputil_multiply_add", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", ":sincosf_utils", + ":errno", ], ) @@ -2785,17 +2856,8 @@ libc_math_function( libc_math_function( name = "exp", additional_deps = [ - ":__support_fputil_double_double", - ":__support_fputil_dyadic_float", - ":__support_fputil_multiply_add", - ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", - ":__support_fputil_rounding_mode", - ":__support_fputil_triple_double", - ":__support_integer_literals", - ":__support_macros_optimization", - ":common_constants", - ":explogxf", + ":__support_math_exp", + ":errno", ], ) @@ -2818,24 +2880,16 @@ libc_math_function( libc_math_function( name = "exp10", additional_deps = [ - ":__support_fputil_double_double", - ":__support_fputil_dyadic_float", - ":__support_fputil_multiply_add", - ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", - ":__support_fputil_rounding_mode", - ":__support_fputil_triple_double", - ":__support_integer_literals", - ":__support_macros_optimization", - ":common_constants", - ":explogxf", + ":__support_math_exp10", + ":errno", ], ) libc_math_function( name = "exp10f", additional_deps = [ - ":exp10f_impl", + ":__support_math_exp10f", + ":errno", ], ) @@ -3684,14 +3738,13 @@ libc_math_function( ":__support_fputil_multiply_add", ":__support_fputil_nearest_integer", ":__support_fputil_polyeval", - ":__support_fputil_rounding_mode", ":__support_fputil_sqrt", ":__support_fputil_triple_double", ":__support_macros_optimization", + ":__support_math_exp10f", ":common_constants", ":explogxf", ":exp2f_impl", - ":exp10f_impl", ], ) @@ -3800,7 +3853,6 @@ libc_math_function( name = "sinf", additional_deps = [ ":__support_fputil_fma", - ":__support_fputil_multiply_add", ":__support_fputil_polyeval", ":__support_fputil_rounding_mode", ":__support_macros_optimization", @@ -3970,6 +4022,17 @@ libc_math_function( ], ) +libc_math_function( + name = "tanpif", + additional_deps = [ + ":sincosf_utils", + ":hdr_fenv_macros", + ":__support_macros_config", + ":__support_macros_optimization", + ":__support_fputil_multiply_add", + ], +) + libc_math_function( name = "tanpif16", additional_deps = [ @@ -4406,7 +4469,9 @@ libc_support_library( "src/string/memory_utils/aarch64/inline_memcpy.h", "src/string/memory_utils/aarch64/inline_memmove.h", "src/string/memory_utils/aarch64/inline_memset.h", + "src/string/memory_utils/arm/common.h", "src/string/memory_utils/arm/inline_memcpy.h", + "src/string/memory_utils/arm/inline_memset.h", "src/string/memory_utils/generic/aligned_access.h", "src/string/memory_utils/generic/byte_per_byte.h", "src/string/memory_utils/inline_bcmp.h", diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel index 23cead18de4c1..d20922df1b8b5 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel @@ -530,6 +530,14 @@ math_mpfr_test( ], ) +math_mpfr_test( + name = "tanpif", + hdrs = ["sdcomp26094.h"], + deps = [ + "//libc:__support_cpp_array", + ], +) + math_mpfr_test( name = "sqrt", hdrs = ["SqrtTest.h"], diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel index cf25376878347..4df1c4508d764 100644 --- 
a/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel @@ -1518,6 +1518,8 @@ math_test(name = "tanf") math_test(name = "tanhf") +math_test(name = "tanpif") + math_test( name = "totalorder", hdrs = ["TotalOrderTest.h"], diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f5f0d92685e0c..683885e1d4123 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3958,6 +3958,7 @@ cc_library( ":BufferizationToMemRef", ":ComplexToLLVM", ":ComplexToLibm", + ":ComplexToROCDLLibraryCalls", ":ComplexToSPIRV", ":ComplexToStandard", ":ControlFlowToLLVM", @@ -11900,6 +11901,25 @@ cc_library( ], ) +cc_library( + name = "ComplexToROCDLLibraryCalls", + srcs = glob([ + "lib/Conversion/ComplexToROCDLLibraryCalls/*.cpp", + ]), + hdrs = glob([ + "include/mlir/Conversion/ComplexToROCDLLibraryCalls/*.h", + ]), + includes = ["include"], + deps = [ + ":ComplexDialect", + ":ConversionPassIncGen", + ":FuncDialect", + ":IR", + ":Pass", + ":TransformUtils", + ], +) + cc_library( name = "ComplexToSPIRV", srcs = glob([ @@ -13657,17 +13677,17 @@ cc_library( ]), includes = ["include"], deps = [ - ":XeVMDialect", ":ConversionPassIncGen", - ":ConvertToLLVMInterface", - ":GPUDialect", + ":ConvertToLLVMInterface", + ":GPUDialect", ":IR", - ":LLVMCommonConversion", - ":LLVMDialect", + ":LLVMCommonConversion", + ":LLVMDialect", ":Pass", - ":Support", + ":Support", ":TransformUtils", ":VectorDialect", - "//llvm:Support", + ":XeVMDialect", + "//llvm:Support", ], ) From faf7d914093c87804e9dbca349b1a2bca0aefd18 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 24 Jul 2025 13:56:18 -0700 Subject: [PATCH 2/3] updated test Created using spr 1.3.4 --- bolt/test/X86/unclaimed-jt-entries.s | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/bolt/test/X86/unclaimed-jt-entries.s b/bolt/test/X86/unclaimed-jt-entries.s index 1102e4ae413e2..b5c5abfbedebc 100644 --- a/bolt/test/X86/unclaimed-jt-entries.s +++ b/bolt/test/X86/unclaimed-jt-entries.s @@ -18,6 +18,16 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q + +## Check that non-simple function profile is emitted in perf2bolt mode +# RUN: link_fdata %s %t.exe %t.pa PREAGG +# RUN: llvm-strip -N L5 -N L5_ret %t.exe +# RUN: perf2bolt %t.exe -p %t.pa --pa -o %t.fdata -strict=0 -print-profile \ +# RUN: -print-only=main | FileCheck %s --check-prefix=CHECK-P2B +# CHECK-P2B: PERF2BOLT: traces mismatching disassembled function contents: 0 +# CHECK-P2B: Binary Function "main" +# CHECK-P2B: IsSimple : 0 + # RUN: llvm-bolt %t.exe -v=1 -o %t.out 2>&1 | FileCheck %s # CHECK: BOLT-WARNING: unclaimed data to code reference (possibly an unrecognized jump table entry) to .Ltmp[[#]] in main @@ -33,8 +43,10 @@ .size main, .Lend-main main: jmp *L4-24(,%rdi,8) -.L5: +# PREAGG: T #main# #L5# #L5_ret# 1 +L5: movl $4, %eax +L5_ret: ret .L9: movl $2, %eax @@ -58,7 +70,7 @@ L4: .quad .L3 .quad .L6 .quad .L3 - .quad .L5 + .quad L5 .quad .L3 .quad .L3 .quad .L3 From 488ec264371f527b01b3c4bdebee8e65168f2695 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 24 Jul 2025 13:57:48 -0700 Subject: [PATCH 3/3] updated test Created using spr 1.3.4 --- bolt/test/X86/unclaimed-jt-entries.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bolt/test/X86/unclaimed-jt-entries.s 
b/bolt/test/X86/unclaimed-jt-entries.s index b5c5abfbedebc..31b72c47125ae 100644 --- a/bolt/test/X86/unclaimed-jt-entries.s +++ b/bolt/test/X86/unclaimed-jt-entries.s @@ -27,6 +27,8 @@ # CHECK-P2B: PERF2BOLT: traces mismatching disassembled function contents: 0 # CHECK-P2B: Binary Function "main" # CHECK-P2B: IsSimple : 0 +# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA +# CHECK-FDATA: 1 main 0 1 main 7 0 1 # RUN: llvm-bolt %t.exe -v=1 -o %t.out 2>&1 | FileCheck %s
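
## For reference, each line of the emitted fdata profile is a branch record of
## the form: <from-is-sym> <from> <from-offset> <to-is-sym> <to> <to-offset>
## <mispreds> <count>. The CHECK-FDATA line above therefore matches a single
## trace from main+0 to main+0x7 with no mispredictions and a count of one.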